List of usage examples for org.apache.lucene.index.IndexWriterConfig#setSimilarity
public IndexWriterConfig setSimilarity(Similarity similarity)
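Before the individual examples, here is a minimal, self-contained sketch of the typical call pattern, assuming a recent Lucene release (5.x or later, where IndexWriterConfig takes only an Analyzer) and the built-in BM25Similarity; the index path, field name, and BM25 parameters are illustrative only:

import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class SetSimilarityExample {
    public static void main(String[] args) throws Exception {
        // Illustrative path and parameters - adjust for your environment.
        Directory dir = FSDirectory.open(Paths.get("/tmp/example-index"));

        IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
        // Install the Similarity the IndexWriter will use when encoding norms.
        config.setSimilarity(new BM25Similarity(1.2f, 0.75f));

        try (IndexWriter writer = new IndexWriter(dir, config)) {
            Document doc = new Document();
            doc.add(new TextField("contents", "hello similarity", Field.Store.NO));
            writer.addDocument(doc);
        }
        // At search time, call IndexSearcher#setSimilarity with the same Similarity
        // so that query scoring matches the norms written above.
    }
}

The index-time Similarity determines how norms are encoded, so the same (or a compatible) Similarity is normally also set on the IndexSearcher at query time, as several of the examples below do.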
From source file:org.ihtsdo.otf.query.lucene.LuceneIndexer.java
License:Apache License
protected LuceneIndexer(String indexName) throws IOException {
    try {
        indexName_ = indexName;
        luceneWriterService = LookupService.getService(WorkExecutors.class).getExecutor();
        luceneWriterFutureCheckerService = Executors.newFixedThreadPool(1,
                new NamedThreadFactory(indexName + " Lucene future checker", false));

        Path searchFolder = LookupService.getService(ConfigurationService.class).getSearchFolderPath();

        if (luceneRootFolder_.compareAndSet(null, new File(searchFolder.toFile(), DEFAULT_LUCENE_FOLDER))) {
            luceneRootFolder_.get().mkdirs();
        }

        indexFolder_ = new File(luceneRootFolder_.get(), indexName);
        indexFolder_.mkdirs();
        log.info("Index: " + indexFolder_.getAbsolutePath());

        // Switch over to MMapDirectory - in theory this gives us back some room on the JDK heap,
        // letting the OS directly manage the caching of the index files - and, more importantly,
        // gives us a huge performance boost during any operation that does multi-threaded reads of
        // the index (like the SOLOR rules processing), because the default SimpleFSDirectory is a
        // huge bottleneck.
        Directory indexDirectory = new MMapDirectory(indexFolder_);
        indexDirectory.clearLock("write.lock");

        IndexWriterConfig config = new IndexWriterConfig(luceneVersion, new PerFieldAnalyzer());
        config.setRAMBufferSizeMB(256);
        MergePolicy mergePolicy = new LogByteSizeMergePolicy();
        config.setMergePolicy(mergePolicy);
        config.setSimilarity(new ShortTextSimilarity());

        IndexWriter indexWriter = new IndexWriter(indexDirectory, config);
        trackingIndexWriter = new TrackingIndexWriter(indexWriter);

        boolean applyAllDeletes = false;
        searcherManager = new SearcherManager(indexWriter, applyAllDeletes, null);

        // Create the ControlledRealTimeReopenThread that reopens the index periodically, taking into
        // account the changes made to the index and tracked by the TrackingIndexWriter instance.
        // The index is refreshed every 60 seconds when nobody is waiting, and every 100 ms when
        // someone is waiting (see the search method).
        // (see http://lucene.apache.org/core/4_3_0/core/org/apache/lucene/search/NRTManagerReopenThread.html)
        reopenThread = new ControlledRealTimeReopenThread<>(trackingIndexWriter, searcherManager, 60.00, 0.1);
        this.startThread();

        // Register for commits:
        log.info("Registering indexer " + getIndexerName() + " for commits");
        Get.commitService().addChangeListener(new ChronologyChangeListener() {
            @Override
            public void handleCommit(CommitRecord commitRecord) {
                commitRecord.getSememesInCommit().stream().forEach(sememeId -> {
                    handleChange(Get.sememeService().getSememe(sememeId));
                });
            }

            @Override
            public void handleChange(SememeChronology<? extends SememeVersion<?>> sc) {
                log.info("submitting sememe " + sc.toUserString() + " to indexer " + getIndexerName()
                        + " due to commit");
                index(sc);
            }

            @Override
            public void handleChange(ConceptChronology<? extends StampedVersion> cc) {
                // noop
            }

            @Override
            public UUID getListenerUuid() {
                return UuidT5Generator.get(getIndexerName());
            }
        });
    } catch (Exception e) {
        LookupService.getService(SystemStatusService.class).notifyServiceConfigurationFailure(indexName, e);
        throw e;
    }
}
From source file:org.neo4j.index.impl.lucene.legacy.WritableIndexReferenceFactory.java
License:Open Source License
private IndexWriter newIndexWriter(IndexIdentifier identifier) {
    try {
        Directory indexDirectory = getIndexDirectory(identifier);
        IndexType type = getType(identifier);
        IndexWriterConfig writerConfig = new IndexWriterConfig(type.analyzer);
        writerConfig.setIndexDeletionPolicy(new MultipleBackupDeletionPolicy());
        Similarity similarity = type.getSimilarity();
        if (similarity != null) {
            writerConfig.setSimilarity(similarity);
        }
        return new IndexWriter(indexDirectory, writerConfig);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
From source file:org.neo4j.index.impl.lucene.LuceneDataSource.java
License:Open Source License
private /*synchronized elsewhere*/ IndexWriter newIndexWriter(IndexIdentifier identifier) {
    assertNotClosed();
    try {
        Directory dir = filesystemFacade.getDirectory(baseStorePath, identifier);
        directoryExists(dir);
        IndexType type = getType(identifier, false);
        IndexWriterConfig writerConfig = new IndexWriterConfig(LUCENE_VERSION, type.analyzer);
        writerConfig.setIndexDeletionPolicy(new MultipleBackupDeletionPolicy());
        Similarity similarity = type.getSimilarity();
        if (similarity != null) {
            writerConfig.setSimilarity(similarity);
        }
        IndexWriter indexWriter = new IndexWriter(dir, writerConfig);
        // TODO We should tamper with this value and see how it affects the general performance.
        // Lucene docs say rather < 10 for mixed reads/writes.
        // writer.setMergeFactor( 8 );
        return indexWriter;
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
From source file:org.waveprotocol.box.server.search.LuceneSearchImpl.java
License:Apache License
@Inject
public LuceneSearchImpl(IndexDirectory directory, WaveMap waveMap, WaveDigester digester,
        TextCollator textCollator, @Named(CoreSettings.WAVE_SERVER_DOMAIN) final String waveDomain,
        @ExecutorAnnotations.IndexExecutor ScheduledExecutorService indexExecutor) {
    this.textCollator = textCollator;
    this.waveMap = waveMap;
    this.digester = digester;
    this.waveDomain = waveDomain;
    this.indexExecutor = indexExecutor;
    sharedDomainParticipant = ParticipantIdUtil.makeUnsafeSharedDomainParticipantId(waveDomain).getAddress();
    analyzer = new StandardAnalyzer(LUCENE_VERSION, StandardAnalyzer.STOP_WORDS_SET);
    similarity = new DefaultSimilarity() {
        @Override
        public float computeNorm(String string, FieldInvertState fis) {
            return fis.getBoost();
        }

        @Override
        public float tf(float freq) {
            return freq > 0 ? 1.0f : 0.0f;
        }

        @Override
        public float tf(int freq) {
            return freq > 0 ? 1 : 0;
        }
    };
    try {
        IndexWriterConfig config = new IndexWriterConfig(LUCENE_VERSION, analyzer);
        config.setOpenMode(OpenMode.CREATE_OR_APPEND);
        config.setSimilarity(similarity);
        indexWriter = new IndexWriter(directory.getDirectory(), config);
        nrtManager = new NRTManager(indexWriter, new WaveSearchWarmer(waveDomain));
    } catch (IOException ex) {
        throw new IndexException(ex);
    }
    nrtManagerReopenThread = new NRTManagerReopenThread(nrtManager, MAX_STALE_SEC, MIN_STALE_SEC);
    nrtManagerReopenThread.start();
    ShutdownManager.getInstance().register(new Shutdownable() {
        @Override
        public void shutdown() throws Exception {
            synchronized (LuceneSearchImpl.this) {
                if (!isClosed) {
                    close();
                }
            }
        }
    }, LuceneSearchImpl.class.getSimpleName(), ShutdownPriority.Storage);
}
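The anonymous DefaultSimilarity above disables term-frequency weighting: a term either occurs in a document or it does not. On later Lucene releases, where DefaultSimilarity was effectively renamed ClassicSimilarity, the same idea can be expressed as a small named subclass. A minimal sketch, assuming a 5.x-or-newer API; the class name BinaryTfSimilarity is illustrative, not part of the Wave source:

import org.apache.lucene.search.similarities.ClassicSimilarity;

public final class BinaryTfSimilarity extends ClassicSimilarity {
    @Override
    public float tf(float freq) {
        // Ignore how often a term occurs in a document; only presence counts.
        return freq > 0 ? 1.0f : 0.0f;
    }
}

// Usage (analyzer is whatever Analyzer the application already uses):
// IndexWriterConfig config = new IndexWriterConfig(analyzer);
// config.setSimilarity(new BinaryTfSimilarity());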
From source file:pretraga.IsolationSimilarity.java
public void addDocument(String indexName, String documentPath) {
    try {
        Directory dir = FSDirectory.open(new File(indexDirectoryPath).toPath());
        IndexWriterConfig config = new IndexWriterConfig(this.analyzer);
        config.setSimilarity(new ClassicSimilarity());
        IndexWriter writer = new IndexWriter(dir, config);

        Document doc = new Document();
        File file = new File(documentPath);
        String fileSize = String.valueOf(file.length());
        String content = new String(Files.readAllBytes(file.toPath()), StandardCharsets.UTF_8);
        BytesRef by = new BytesRef(fileSize.getBytes());

        FieldType textType = new FieldType(TextField.TYPE_STORED);
        textType.setStored(true);
        textType.setStoreTermVectors(true);
        textType.setStoreTermVectorOffsets(true);

        doc.add(new LongField(SIZE, file.length(), Field.Store.YES));
        doc.add(new Field(TITLE, indexName, textType));
        doc.add(new Field(CONTENT, content, textType));

        writer.addDocument(doc);
        writer.close();
        dir.close();
    } catch (Exception e) {
        System.err.println(e.toString());
    }
}
From source file:proj.zoie.impl.indexing.internal.DiskSearchIndex.java
License:Apache License
/**
 * Opens an index writer.
 * @param analyzer Analyzer
 * @param similarity Similarity; may be null
 * @return IndexWriter instance
 */
public IndexWriter openIndexWriter(Analyzer analyzer, Similarity similarity) throws IOException {
    if (_indexWriter != null)
        return _indexWriter;

    Directory directory = _dirMgr.getDirectory(true);
    log.info("opening index writer at: " + _dirMgr.getPath());

    ZoieMergePolicy mergePolicy = new ZoieMergePolicy();
    mergePolicy.setMergePolicyParams(_mergePolicyParams);

    // hao: autocommit is set to false with this constructor
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_34, analyzer);
    config.setOpenMode(OpenMode.CREATE_OR_APPEND);
    config.setIndexDeletionPolicy(_deletionPolicy);
    config.setMergeScheduler(_mergeScheduler);
    config.setMergePolicy(mergePolicy);
    config.setReaderPooling(false);
    if (similarity != null) {
        config.setSimilarity(similarity);
    }
    config.setRAMBufferSizeMB(5);

    IndexWriter idxWriter = new IndexWriter(directory, config);
    _indexWriter = idxWriter;
    return idxWriter;
}
From source file:proj.zoie.impl.indexing.internal.RAMSearchIndex.java
License:Apache License
public IndexWriter openIndexWriter(Analyzer analyzer, Similarity similarity) throws IOException {
    if (_indexWriter != null)
        return _indexWriter;

    ZoieMergePolicy mergePolicy = new ZoieMergePolicy();
    mergePolicy.setMergePolicyParams(_mergePolicyParams);
    mergePolicy.setUseCompoundFile(false);

    IndexWriterConfig config = indexWriterConfigStorage.get();
    if (config == null) {
        config = new IndexWriterConfig(Version.LUCENE_34, analyzer);
        indexWriterConfigStorage.set(config);
    }
    config.setOpenMode(OpenMode.CREATE_OR_APPEND);
    config.setMergeScheduler(_mergeScheduler);
    config.setMergePolicy(mergePolicy);
    config.setReaderPooling(false);
    if (similarity != null) {
        config.setSimilarity(similarity);
    }
    config.setRAMBufferSizeMB(3);

    IndexWriter idxWriter = new IndexWriter(_directory, config);
    _indexWriter = idxWriter;
    return idxWriter;
}
From source file:reviewclassification.ReviewClassification.java
/**
 * Makes predictions for a test set. Saves them in a hash map that is later written to a file,
 * in ascending file-name order. Uses cos_score by default.
 * @param training_set The training set
 * @param query_set The test set
 * @param threshold The threshold used for queries and predictions
 * @param filename The name of the file that holds the results
 * @throws org.apache.lucene.queryparser.classic.ParseException
 * @throws IOException
 */
public static void predictTestSet(ArrayList<Document> training_set, ArrayList<Document> query_set,
        int threshold, String filename) throws org.apache.lucene.queryparser.classic.ParseException, IOException {
    Similarity cos_sim = new ClassicSimilarity();
    FileWriter outfile = new FileWriter(filename);
    HashMap<String, Boolean> predictions = new HashMap<>();

    tlog(ft, "Building document index.");

    // Lucene setup: build an analyzer and an index, create a configuration, make an index writer,
    // write the documents to the index, then open a reader and a searcher from the reader.
    // Cosine similarity is set as the similarity method in both the configuration and the searcher.
    StandardAnalyzer analyzer = new StandardAnalyzer();
    Directory index = new RAMDirectory();

    IndexWriterConfig config = new IndexWriterConfig(analyzer);
    config.setSimilarity(cos_sim);

    IndexWriter w = new IndexWriter(index, config);
    addDocuments(w, training_set);
    w.close();

    IndexReader reader = DirectoryReader.open(index);
    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(cos_sim);

    tlog(ft, "Done building index. Predicting the test set.");

    // For each review in the test set, query the index, get the results, and predict with the given
    // threshold. Then add the prediction to the hash map. The key is the name of the file; we only
    // have the path, so we split it, take the file name, and remove the extension.
    for (Document doc : query_set) {
        ArrayList<Result> results = query(doc, analyzer, searcher, threshold);
        boolean cos_pred = predict(results, doc, threshold, COS_SCORE);
        String[] str = doc.get("path").split("/");
        predictions.put(str[str.length - 1].split("\\.")[0], cos_pred);
    }

    // Sort files in ascending file-name order.
    tlog(ft, "Done predicting test set. Sorting files.");
    ArrayList<String> files = new ArrayList<>(predictions.keySet());
    files.sort(new Comparator() {
        @Override
        public int compare(Object o1, Object o2) {
            String s1 = (String) o1;
            String s2 = (String) o2;
            return s1.compareTo(s2);
        }
    });

    tlog(ft, "Done sorting files. Writing to disk.");

    // Write results to disk.
    for (String s : files) {
        outfile.write(s + " " + boolToInt(predictions.get(s)) + System.lineSeparator());
    }
    outfile.close();

    tlog(ft, "Done writing to disk. Results in: " + filename);
}
From source file:reviewclassification.ReviewClassification.java
/**
 * Uses 9/10 of the training set as the training set and 1/10 as the test set, chosen randomly.
 * Makes predictions with all 3 scoring methods and with multiple thresholds, to decide the best
 * scoring method and the best threshold to use.
 * @param documents The training set, which will be divided to create a test set
 * @param threshold_start The minimum threshold
 * @param threshold_end The maximum threshold
 * @param filename The name of the file that holds the results
 * @throws IOException
 * @throws org.apache.lucene.queryparser.classic.ParseException
 */
public static void accuracyTest(ArrayList<Document> documents, int threshold_start, int threshold_end,
        String filename) throws IOException, org.apache.lucene.queryparser.classic.ParseException {
    long seed = System.nanoTime();
    Collections.shuffle(documents, new Random(seed));
    FileWriter outfile = new FileWriter(filename);

    // 9/10 of the training set is used for training; the remaining 1/10 is used for testing.
    ArrayList<Document> training_set = new ArrayList<>(documents.subList(0, documents.size() * 9 / 10));
    ArrayList<Document> test_set = new ArrayList<>(
            documents.subList(documents.size() * 9 / 10, documents.size()));

    // Metrics objects hold tp, fp, tn, and fn counters. We keep one for each threshold. We are
    // testing with 3 scoring methods, so we need 3 lists of objects, each containing an object per
    // threshold.
    ArrayList<Integer> threshold_list = new ArrayList<>();
    ArrayList<Metrics> metrics_list_knn = new ArrayList<>();
    ArrayList<Metrics> metrics_list_knn_sentiment = new ArrayList<>();
    ArrayList<Metrics> metrics_list_cos_score = new ArrayList<>();

    // Initialize the metrics objects.
    for (int i = threshold_start; i <= threshold_end; i++) {
        threshold_list.add(i);
        metrics_list_knn.add(new Metrics());
        metrics_list_knn_sentiment.add(new Metrics());
        metrics_list_cos_score.add(new Metrics());
    }

    // Built-in cosine similarity method.
    Similarity cos_sim = new ClassicSimilarity();

    tlog(ft, "Building document index.");

    // Lucene setup: build an analyzer and an index, create a configuration, make an index writer,
    // write the documents to the index, then open a reader and a searcher from the reader.
    // Cosine similarity is set as the similarity method in both the configuration and the searcher.
    StandardAnalyzer analyzer = new StandardAnalyzer();
    Directory index = new RAMDirectory();

    IndexWriterConfig config = new IndexWriterConfig(analyzer);
    config.setSimilarity(cos_sim);

    IndexWriter w = new IndexWriter(index, config);
    addDocuments(w, training_set);
    w.close();

    IndexReader reader = DirectoryReader.open(index);
    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(cos_sim);

    tlog(ft, "Done building index. Predicting the test set.");

    // For each review in the test set, query the index, get the results, then predict with a given
    // threshold. Testing with multiple thresholds to find which one to use.
    for (Document doc : test_set) {
        ArrayList<Result> results = query(doc, analyzer, searcher,
                threshold_list.get(threshold_list.size() - 1));
        boolean query_class = doc.get("path").contains("pos");

        // We execute the query only once, then for each threshold count the results with the
        // appropriate metrics object.
        for (int i = 0; i < threshold_list.size(); i++) {
            boolean knn_pred = predict(results, doc, threshold_list.get(i), KNN);
            boolean knn_senti_pred = predict(results, doc, threshold_list.get(i), KNN_SENTIMENT);
            boolean cos_pred = predict(results, doc, threshold_list.get(i), COS_SCORE);

            update_metrics(metrics_list_knn.get(i), query_class, knn_pred);
            update_metrics(metrics_list_knn_sentiment.get(i), query_class, knn_senti_pred);
            update_metrics(metrics_list_cos_score.get(i), query_class, cos_pred);
        }
    }

    tlog(ft, "Done predicting test set. Calculating accuracies and writing to file.");

    // For each metrics object we call calculate(), which computes the accuracy, then write it to file.
    for (int i = 0; i < threshold_list.size(); i++) {
        metrics_list_knn.get(i).calculate();
        metrics_list_knn_sentiment.get(i).calculate();
        metrics_list_cos_score.get(i).calculate();
        outfile.write(threshold_list.get(i) + " " + metrics_list_knn.get(i).getAccuracy() + " "
                + metrics_list_knn_sentiment.get(i).getAccuracy() + " "
                + metrics_list_cos_score.get(i).getAccuracy() + System.lineSeparator());
    }
    outfile.close();

    tlog(ft, "Done writing to file. Results in: " + filename);
}
From source file:semanticRelatedness.MakeLuceneIndex.java
License:Apache License
/**
 * Index all text files under a directory.
 * @throws UnsupportedEncodingException
 * @throws FileNotFoundException
 */
public static void main(String[] args) throws FileNotFoundException, UnsupportedEncodingException {
    String baseDir = "/home/chrisschaefer/";
    //String wikiDumpFile = "Downloads/enwiki-20130604-pages-articles.xml.bz2";
    String wikiDumpFile = "enwiki-20130604-pages-articlese.xml.bz2";
    String luceneIndexName = "enwiki-20130604-lucene2";
    System.currentTimeMillis();

    boolean bIgnoreStubs = false;

    for (int i = 0; i < args.length; ++i) {
        if (args[i].equals("-luceneindex"))
            luceneIndexName = args[++i];
        if (args[i].equals("-basedir"))
            baseDir = args[++i];
        if (args[i].equals("-dumpfile"))
            wikiDumpFile = args[++i];
        if (args[i].equals("-includestubs"))
            bIgnoreStubs = true;
    }

    String rawTextPath = baseDir + luceneIndexName + "-raw-text.txt";
    String logPath = baseDir + luceneIndexName + ".log";
    PrintWriter artikelTextWriter = new PrintWriter(rawTextPath, "UTF-8");
    PrintWriter logger = new PrintWriter(logPath, "UTF-8");
    logger.println("Indexing to directory '" + baseDir + luceneIndexName + "'");
    System.out.println("Indexing to directory '" + baseDir + luceneIndexName + "'");

    Date start = new Date();

    try {
        Directory dir = FSDirectory.open(new File(baseDir + luceneIndexName));
        // Analyzer analyzer = new WikipediaAnalyzer();
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_43, analyzer);

        // Create a new index in the directory, removing any previously indexed documents:
        iwc.setOpenMode(OpenMode.CREATE);
        iwc.setSimilarity(new DefaultSimilarity());
        //iwc.setSimilarity(new ESASimilarity());

        // Optional: for better indexing performance, if you are indexing many documents,
        // increase the RAM buffer. But if you do this, increase the max heap size given to
        // the JVM (e.g. add -Xmx512m or -Xmx1g):
        // iwc.setRAMBufferSizeMB(2000.0);

        IndexWriter writer = new IndexWriter(dir, iwc);

        Extractor wikidumpExtractor = new Extractor(baseDir + File.separator + wikiDumpFile);
        wikidumpExtractor.setLinkSeparator("_");
        wikidumpExtractor.setCategorySeparator("_");
        wikidumpExtractor.setTitleSeparator(" ");

        int iStubs = 0;
        int iArticleCount = 0;
        int iSkippedPageCount = 0;
        long iStartTime = java.lang.System.nanoTime();
        long iTime = iStartTime;

        while (wikidumpExtractor.nextPage()) {
            if (wikidumpExtractor.getPageType() != Extractor.PageType.ARTICLE) {
                ++iSkippedPageCount;
                continue;
            }

            if (bIgnoreStubs && wikidumpExtractor.getStub()) {
                ++iStubs;
                continue;
            }

            // skip pages with less than 5 out links
            if (wikidumpExtractor.getPageLinkList(true).size() < 5) {
                ++iSkippedPageCount;
                continue;
            }

            if (wikidumpExtractor.getPageCategories().equals("")) {
                ++iSkippedPageCount;
                logger.println("skipped because of stop category: " + wikidumpExtractor.getPageTitle(false));
                continue;
            } else {
                for (String link : wikidumpExtractor.getPageLinkList(false)) {
                    // artikelTextWriter.println(link);
                    if (_inLinks.containsKey(link)) {
                        int tmp = _inLinks.get(link);
                        tmp++;
                        _inLinks.put(link, tmp);
                    } else {
                        _inLinks.put(link, 1);
                    }
                }
            }

            if (wikidumpExtractor.getPageText().equals("")) {
                ++iSkippedPageCount;
                continue;
            }

            artikelTextWriter.println(
                    wikidumpExtractor.getPageTitle(false) + "\t" + wikidumpExtractor.getPageText(false));

            ++iArticleCount;
            if (iArticleCount % 1000 == 0) {
                logger.println(new Date().toString() + " phase 1 -- iArticleCount: " + iArticleCount
                        + " iSkippedPageCount: " + iSkippedPageCount);
            }
        }
        artikelTextWriter.close();

        iArticleCount = 0;

        PrintWriter artikelInLinkWriter = new PrintWriter(baseDir + luceneIndexName + "-inlinks.txt", "UTF-8");
        BufferedReader br = new BufferedReader(new FileReader(rawTextPath));
        String line = br.readLine();

        while (line != null) {
            int endOfTitle = line.indexOf("\t");
            String title = line.substring(0, endOfTitle);

            if (_inLinks.containsKey(title)) {
                int inlinks = _inLinks.get(title);
                artikelInLinkWriter.println(title + "\t" + inlinks);

                if (inlinks > 4) {
                    //System.out.println("inlinks > 0 ");
                    Document doc = new Document();
                    ++iArticleCount;

                    // wikidumpExtractor.setTitleSeparator( "_" );
                    // doc.add( new TextField( "url_title", wikidumpExtractor.getPageTitle( false ), Field.Store.YES) );
                    // doc.add( new TextField( "title", wikidumpExtractor.getPageTitle( false ), Field.Store.YES) );
                    // doc.add( new LongField( "wiki_id", wikidumpExtractor.getPageId(), Field.Store.YES) );
                    doc.add(new TextField("contents", title + " " + title + " " + title + " " + title + " "
                            + line.substring(endOfTitle + 1), Field.Store.NO));

                    writer.addDocument(doc);

                    if (iArticleCount % 1000 == 0) {
                        writer.commit();
                        logger.println(new Date().toString() + " phase 2 -- iArticleCount: " + iArticleCount
                                + " iSkippedPageCount: " + iSkippedPageCount);
                    }
                }
            } else {
                artikelInLinkWriter.println(title + "\t0");
            }
            line = br.readLine();
        }
        br.close();
        artikelInLinkWriter.close();

        // NOTE: if you want to maximize search performance, you can optionally call forceMerge here.
        // This can be a terribly costly operation, so generally it's only worth it when your index is
        // relatively static (i.e. you're done adding documents to it):
        // writer.commit();
        writer.forceMerge(1);
        writer.close();

        Date end = new Date();
        String endStatement = end.getTime() - start.getTime() + " total milliseconds ("
                + (end.getTime() - start.getTime()) / 3600000.0 + " hours), " + iArticleCount + " Articles.";
        logger.println(endStatement);
        System.out.println(endStatement);
        logger.close();
    } catch (Exception e) {
        System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
}