List of usage examples for org.apache.lucene.search.IndexSearcher.setSimilarity
public void setSimilarity(Similarity similarity)
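Sets the Similarity implementation this searcher uses to score matching documents, replacing the default. Before the examples below, here is a minimal sketch of the call in isolation; the index path and the BM25 parameters are placeholders, not values from any of the source files:

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

IndexSearcher openSearcher(String indexPath) throws Exception {
    Directory dir = FSDirectory.open(Paths.get(indexPath)); // hypothetical path
    DirectoryReader reader = DirectoryReader.open(dir);
    IndexSearcher searcher = new IndexSearcher(reader);
    // Score queries with BM25 and explicit k1/b parameters instead of the default similarity.
    searcher.setSimilarity(new BM25Similarity(1.2f, 0.75f));
    return searcher;
}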
From source file:org.hibernate.search.query.engine.impl.HSQueryImpl.java
License:Open Source License
/**
 * Build the index searcher for this fulltext query.
 *
 * @param searchFactoryImplementor the search factory.
 * @param forceScoring if true, force SCORE computation; if false, force not to compute score; if null, use best choice
 *
 * @return the <code>IndexSearcher</code> for this query (can be <code>null</code>).
 * TODO change classesAndSubclasses by side effect, which is a mismatch with the Searcher return, fix that.
 */
private IndexSearcherWithPayload buildSearcher(SearchFactoryImplementor searchFactoryImplementor,
        Boolean forceScoring) {
    Map<Class<?>, EntityIndexBinder<?>> builders = searchFactoryImplementor.getIndexBindingForEntity();
    List<IndexManager> targetedIndexes = new ArrayList<IndexManager>();
    Set<String> idFieldNames = new HashSet<String>();
    Similarity searcherSimilarity = null;

    //TODO check if caching this work for the last n list of indexedTargetedEntities makes a perf boost
    if (indexedTargetedEntities.size() == 0) {
        // empty indexedTargetedEntities array means search over all indexed entities,
        // but we have to make sure there is at least one
        if (builders.isEmpty()) {
            throw new SearchException(
                    "There are no mapped entities. Don't forget to add @Indexed to at least one class.");
        }
        for (EntityIndexBinder indexBinder : builders.values()) {
            DocumentBuilderIndexedEntity<?> builder = indexBinder.getDocumentBuilder();
            searcherSimilarity = checkSimilarity(searcherSimilarity, builder);
            if (builder.getIdKeywordName() != null) {
                idFieldNames.add(builder.getIdKeywordName());
                allowFieldSelectionInProjection = allowFieldSelectionInProjection
                        && builder.allowFieldSelectionInProjection();
            }
            useFieldCacheOnClassTypes = useFieldCacheOnClassTypes
                    || builder.getFieldCacheOption().contains(FieldCacheType.CLASS);
            populateIndexManagers(targetedIndexes, indexBinder.getSelectionStrategy());
        }
        classesAndSubclasses = null;
    } else {
        Set<Class<?>> involvedClasses = new HashSet<Class<?>>(indexedTargetedEntities.size());
        involvedClasses.addAll(indexedTargetedEntities);
        for (Class<?> clazz : indexedTargetedEntities) {
            EntityIndexBinder<?> indexBinder = builders.get(clazz);
            if (indexBinder != null) {
                DocumentBuilderIndexedEntity<?> builder = indexBinder.getDocumentBuilder();
                involvedClasses.addAll(builder.getMappedSubclasses());
            }
        }

        for (Class clazz : involvedClasses) {
            EntityIndexBinder indexBinder = builders.get(clazz);
            //TODO should we rather choose a polymorphic path and allow non mapped entities
            if (indexBinder == null) {
                throw new SearchException("Not a mapped entity (don't forget to add @Indexed): " + clazz);
            }
            DocumentBuilderIndexedEntity<?> builder = indexBinder.getDocumentBuilder();
            if (builder.getIdKeywordName() != null) {
                idFieldNames.add(builder.getIdKeywordName());
                allowFieldSelectionInProjection = allowFieldSelectionInProjection
                        && builder.allowFieldSelectionInProjection();
            }
            searcherSimilarity = checkSimilarity(searcherSimilarity, builder);
            useFieldCacheOnClassTypes = useFieldCacheOnClassTypes
                    || builder.getFieldCacheOption().contains(FieldCacheType.CLASS);
            populateIndexManagers(targetedIndexes, indexBinder.getSelectionStrategy());
        }
        this.classesAndSubclasses = involvedClasses;
    }
    this.idFieldNames = idFieldNames;

    //compute optimization needClassFilterClause
    //if at least one DP contains one class that is not part of the targeted classesAndSubclasses we can't optimize
    if (classesAndSubclasses != null) {
        for (IndexManager indexManager : targetedIndexes) {
            final Set<Class<?>> classesInIndexManager = indexManager.getContainedTypes();
            // if an IndexManager contains only one class, we know for sure it's part of classesAndSubclasses
            if (classesInIndexManager.size() > 1) {
                //risk of needClassFilterClause
                for (Class clazz : classesInIndexManager) {
                    if (!classesAndSubclasses.contains(clazz)) {
                        this.needClassFilterClause = true;
                        break;
                    }
                }
            }
            if (this.needClassFilterClause) {
                break;
            }
        }
    } else {
        Map<Class<?>, EntityIndexBinder<?>> documentBuildersIndexedEntities = searchFactoryImplementor
                .getIndexBindingForEntity();
        this.classesAndSubclasses = documentBuildersIndexedEntities.keySet();
    }

    //set up the searcher
    final IndexManager[] indexManagers = targetedIndexes.toArray(new IndexManager[targetedIndexes.size()]);
    IndexSearcher is = new IndexSearcher(MultiReaderFactory.openReader(indexManagers));
    is.setSimilarity(searcherSimilarity);

    //handle the sort and projection
    final String[] projection = this.projectedFields;
    if (Boolean.TRUE.equals(forceScoring)) {
        return new IndexSearcherWithPayload(is, true, true);
    } else if (Boolean.FALSE.equals(forceScoring)) {
        return new IndexSearcherWithPayload(is, false, false);
    } else if (this.sort != null && projection != null) {
        boolean activate = false;
        for (String field : projection) {
            if (SCORE.equals(field)) {
                activate = true;
                break;
            }
        }
        if (activate) {
            return new IndexSearcherWithPayload(is, true, false);
        }
    }
    //default
    return new IndexSearcherWithPayload(is, false, false);
}
From source file:org.neo4j.index.impl.lucene.legacy.IndexReferenceFactory.java
License:Open Source License
IndexSearcher newIndexSearcher(IndexIdentifier identifier, IndexReader reader) {
    IndexSearcher searcher = new IndexSearcher(reader);
    IndexType type = getType(identifier);
    if (type.getSimilarity() != null) {
        searcher.setSimilarity(type.getSimilarity());
    }
    return searcher;
}
From source file:org.neo4j.index.impl.lucene.LuceneDataSource.java
License:Open Source License
private IndexSearcher newIndexSearcher(IndexIdentifier identifier, IndexReader reader) {
    IndexSearcher searcher = new IndexSearcher(reader);
    IndexType type = getType(identifier, false);
    if (type.getSimilarity() != null) {
        searcher.setSimilarity(type.getSimilarity());
    }
    return searcher;
}
From source file:org.waveprotocol.box.server.search.LuceneSearchImpl.java
License:Apache License
@Override
public SearchResult search(String query, int startAt, int numResults, ParticipantId viewer) {
    LOG.fine("Search query '" + query + "' from user: " + viewer + " [" + startAt + ", "
            + (startAt + numResults - 1) + "]");
    SearchResult result = new SearchResult(query);
    SearchQuery queryParams = queryParser.parseQuery(query);
    List<IndexCondition> indexConditions = SearchQueryHelper.convertToIndexQuery(queryParams, viewer,
            waveDomain);
    try {
        BooleanQuery allQuery = new BooleanQuery();
        String fromParticipant = viewer.getAddress();
        Query userQuery = null;
        if (!indexConditions.isEmpty()) {
            try {
                userQuery = makeQuery(indexConditions);
            } catch (ParseException ex) {
                LOG.log(Level.SEVERE, "Invalid query: " + query, ex);
                return result;
            }
        }
        if (userQuery == null
                || !SearchQueryHelper.withParticipant(indexConditions, sharedDomainParticipant)) {
            TermQuery participantQuery = new TermQuery(
                    new Term(IndexCondition.Field.PARTICIPANTS.toString(), fromParticipant));
            participantQuery.setBoost(0);
            allQuery.add(participantQuery, Occur.MUST);
        }
        if (userQuery != null) {
            userQuery.setBoost(1);
            allQuery.add(userQuery, Occur.MUST);
            for (IndexCondition condition : indexConditions) {
                if (condition.getField() == IndexCondition.Field.CONTENT) {
                    IndexCondition titleCondition = new IndexCondition(IndexCondition.Field.TITLE, null,
                            condition.getValue(), condition.isPhrase(), condition.isNot());
                    Query titleQuery = makeQuery(titleCondition);
                    titleQuery.setBoost(2);
                    allQuery.add(titleQuery, titleCondition.isNot() ? Occur.MUST_NOT : Occur.SHOULD);
                    IndexCondition tagCondition = new IndexCondition(IndexCondition.Field.TAG, null,
                            condition.getValue(), condition.isPhrase(), condition.isNot());
                    Query tagQuery = makeQuery(tagCondition);
                    tagQuery.setBoost(3);
                    allQuery.add(tagQuery, tagCondition.isNot() ? Occur.MUST_NOT : Occur.SHOULD);
                }
            }
        }
        LOG.fine("Search query " + allQuery.toString());
        List<SortField> sortFields = new LinkedList<SortField>();
        sortFields.add(SortField.FIELD_SCORE);
        sortFields.add(new SortField(IndexCondition.Field.LAST_MODIFIED.toString(), longParser, true));
        SearcherManager searcherManager = nrtManager.getSearcherManager(true);
        IndexSearcher indexSearcher = searcherManager.acquire();
        try {
            indexSearcher.setSimilarity(similarity);
            TopDocs hints = indexSearcher.search(allQuery, startAt + numResults,
                    new Sort(sortFields.toArray(new SortField[sortFields.size()])));
            for (int i = startAt; i < hints.scoreDocs.length; i++) {
                try {
                    ScoreDoc hint = hints.scoreDocs[i];
                    Document doc = indexSearcher.doc(hint.doc);
                    result.addDigest(parseDigest(doc, viewer));
                } catch (IOException ex) {
                    LOG.log(Level.SEVERE, "Get digest from index", ex);
                }
            }
        } finally {
            searcherManager.release(indexSearcher);
        }
    } catch (ParseException ex) {
        LOG.log(Level.SEVERE, "Search failed: " + query, ex);
    } catch (IOException ex) {
        LOG.log(Level.SEVERE, "Search failed: " + query, ex);
    }
    return result;
}
From source file:perf.SearchPerfTest.java
License:Apache License
private static void _main(String[] clArgs) throws Exception {

    // args: dirImpl indexPath numThread numIterPerThread
    // eg java SearchPerfTest /path/to/index 4 100
    final Args args = new Args(clArgs);

    Directory dir0;
    final String dirPath = args.getString("-indexPath") + "/index";
    final String dirImpl = args.getString("-dirImpl");

    OpenDirectory od = OpenDirectory.get(dirImpl);

    /*
    } else if (dirImpl.equals("NativePosixMMapDirectory")) {
      dir0 = new NativePosixMMapDirectory(new File(dirPath));
      ramDir = null;
      if (doFacets) {
        facetsDir = new NativePosixMMapDirectory(new File(facetsDirPath));
      }
    } else if (dirImpl.equals("CachingDirWrapper")) {
      dir0 = new CachingRAMDirectory(new MMapDirectory(new File(dirPath)));
      ramDir = null;
    } else if (dirImpl.equals("RAMExceptDirectPostingsDirectory")) {
      // Load only non-postings files into RAMDir (assumes
      // Lucene40PF is the wrapped PF):
      Set<String> postingsExtensions = new HashSet<String>();
      postingsExtensions.add("frq");
      postingsExtensions.add("prx");
      postingsExtensions.add("tip");
      postingsExtensions.add("tim");
      ramDir = new RAMDirectory();
      Directory fsDir = new MMapDirectory(new File(dirPath));
      for (String file : fsDir.listAll()) {
        int idx = file.indexOf('.');
        if (idx != -1 && postingsExtensions.contains(file.substring(idx + 1, file.length()))) {
          continue;
        }
        fsDir.copy(ramDir, file, file, IOContext.READ);
      }
      dir0 = new FileSwitchDirectory(postingsExtensions, fsDir, ramDir, true);
      if (doFacets) {
        facetsDir = new RAMDirectory(new SimpleFSDirectory(new File(facetsDirPath)), IOContext.READ);
      }
    */

    final RAMDirectory ramDir;
    dir0 = od.open(Paths.get(dirPath));
    if (dir0 instanceof RAMDirectory) {
        ramDir = (RAMDirectory) dir0;
    } else {
        ramDir = null;
    }

    // TODO: NativeUnixDir?

    final String analyzer = args.getString("-analyzer");
    final String tasksFile = args.getString("-taskSource");
    final int searchThreadCount = args.getInt("-searchThreadCount");
    final String fieldName = args.getString("-field");
    final boolean printHeap = args.getFlag("-printHeap");
    final boolean doPKLookup = args.getFlag("-pk");
    final int topN = args.getInt("-topN");
    final boolean doStoredLoads = args.getFlag("-loadStoredFields");

    // Used to choose which random subset of tasks we will
    // run, to generate the PKLookup tasks, and to generate
    // any random pct filters:
    final long staticRandomSeed = args.getLong("-staticSeed");

    // Used to shuffle the random subset of tasks:
    final long randomSeed = args.getLong("-seed");

    // TODO: this could be way better.
    final String similarity = args.getString("-similarity");
    // now reflect
    final Class<? extends Similarity> simClazz = Class
            .forName("org.apache.lucene.search.similarities." + similarity).asSubclass(Similarity.class);
    final Similarity sim = simClazz.newInstance();

    System.out.println("Using dir impl " + dir0.getClass().getName());
    System.out.println("Analyzer " + analyzer);
    System.out.println("Similarity " + similarity);
    System.out.println("Search thread count " + searchThreadCount);
    System.out.println("topN " + topN);
    System.out.println("JVM " + (Constants.JRE_IS_64BIT ? "is" : "is not") + " 64bit");
    System.out.println("Pointer is " + RamUsageEstimator.NUM_BYTES_OBJECT_REF + " bytes");

    final Analyzer a;
    if (analyzer.equals("EnglishAnalyzer")) {
        a = new EnglishAnalyzer();
    } else if (analyzer.equals("ClassicAnalyzer")) {
        a = new ClassicAnalyzer();
    } else if (analyzer.equals("StandardAnalyzer")) {
        a = new StandardAnalyzer();
    } else if (analyzer.equals("StandardAnalyzerNoStopWords")) {
        a = new StandardAnalyzer(CharArraySet.EMPTY_SET);
    } else if (analyzer.equals("ShingleStandardAnalyzer")) {
        a = new ShingleAnalyzerWrapper(new StandardAnalyzer(CharArraySet.EMPTY_SET), 2, 2,
                ShingleFilter.DEFAULT_TOKEN_SEPARATOR, true, true, ShingleFilter.DEFAULT_FILLER_TOKEN);
    } else {
        throw new RuntimeException("unknown analyzer " + analyzer);
    }

    final ReferenceManager<IndexSearcher> mgr;
    final IndexWriter writer;
    final Directory dir;

    final String commit = args.getString("-commit");
    final String hiliteImpl = args.getString("-hiliteImpl");
    final String logFile = args.getString("-log");

    final long tSearcherStart = System.currentTimeMillis();

    final boolean verifyCheckSum = !args.getFlag("-skipVerifyChecksum");

    final boolean recacheFilterDeletes = args.getFlag("-recacheFilterDeletes");
    if (recacheFilterDeletes) {
        throw new UnsupportedOperationException("recacheFilterDeletes was deprecated");
    }

    if (args.getFlag("-nrt")) {
        // TODO: get taxoReader working here too
        // TODO: factor out & share this CL processing w/ Indexer
        final int indexThreadCount = args.getInt("-indexThreadCount");
        final String lineDocsFile = args.getString("-lineDocsFile");
        final float docsPerSecPerThread = args.getFloat("-docsPerSecPerThread");
        final float reopenEverySec = args.getFloat("-reopenEverySec");
        final boolean storeBody = args.getFlag("-store");
        final boolean tvsBody = args.getFlag("-tvs");
        final boolean useCFS = args.getFlag("-cfs");
        final String defaultPostingsFormat = args.getString("-postingsFormat");
        final String idFieldPostingsFormat = args.getString("-idFieldPostingsFormat");
        final boolean verbose = args.getFlag("-verbose");
        final boolean cloneDocs = args.getFlag("-cloneDocs");
        final Mode mode = Mode.valueOf(args.getString("-mode", "update").toUpperCase(Locale.ROOT));

        final long reopenEveryMS = (long) (1000 * reopenEverySec);

        if (verbose) {
            InfoStream.setDefault(new PrintStreamInfoStream(System.out));
        }

        if (!dirImpl.equals("RAMDirectory") && !dirImpl.equals("RAMExceptDirectPostingsDirectory")) {
            System.out.println("Wrap NRTCachingDirectory");
            dir0 = new NRTCachingDirectory(dir0, 20, 400.0);
        }

        dir = dir0;

        final IndexWriterConfig iwc = new IndexWriterConfig(a);
        iwc.setOpenMode(IndexWriterConfig.OpenMode.APPEND);
        iwc.setRAMBufferSizeMB(256.0);
        iwc.setIndexDeletionPolicy(NoDeletionPolicy.INSTANCE);

        // TODO: also RAMDirExceptDirect...? need to
        // ... block deletes against wrapped FSDir?
        if (dirImpl.equals("RAMDirectory")) {
            // Let IW remove files only referenced by starting commit:
            iwc.setIndexDeletionPolicy(new KeepNoCommitsDeletionPolicy());
        }

        if (commit != null && commit.length() > 0) {
            System.out.println("Opening writer on commit=" + commit);
            iwc.setIndexCommit(PerfUtils.findCommitPoint(commit, dir));
        }

        ((TieredMergePolicy) iwc.getMergePolicy()).setNoCFSRatio(useCFS ? 1.0 : 0.0);
        //((TieredMergePolicy) iwc.getMergePolicy()).setMaxMergedSegmentMB(1024);
        //((TieredMergePolicy) iwc.getMergePolicy()).setReclaimDeletesWeight(3.0);
        //((TieredMergePolicy) iwc.getMergePolicy()).setMaxMergeAtOnce(4);

        final Codec codec = new Lucene62Codec() {
            @Override
            public PostingsFormat getPostingsFormatForField(String field) {
                return PostingsFormat
                        .forName(field.equals("id") ? idFieldPostingsFormat : defaultPostingsFormat);
            }
        };
        iwc.setCodec(codec);

        final ConcurrentMergeScheduler cms = (ConcurrentMergeScheduler) iwc.getMergeScheduler();
        // Only let one merge run at a time...
        // ... but queue up up to 4, before index thread is stalled:
        cms.setMaxMergesAndThreads(4, 1);

        iwc.setMergedSegmentWarmer(new IndexWriter.IndexReaderWarmer() {
            @Override
            public void warm(LeafReader reader) throws IOException {
                final long t0 = System.currentTimeMillis();
                //System.out.println("DO WARM: " + reader);
                IndexSearcher s = new IndexSearcher(reader);
                s.setQueryCache(null); // don't bench the cache
                s.search(new TermQuery(new Term(fieldName, "united")), 10);
                final long t1 = System.currentTimeMillis();
                System.out.println("warm segment=" + reader + " numDocs=" + reader.numDocs() + ": took "
                        + (t1 - t0) + " msec");
            }
        });

        writer = new IndexWriter(dir, iwc);
        System.out.println("Initial writer.maxDoc()=" + writer.maxDoc());

        // TODO: add -nrtBodyPostingsOffsets instead of
        // hardwired false:
        boolean addDVFields = mode == Mode.BDV_UPDATE || mode == Mode.NDV_UPDATE;
        LineFileDocs lineFileDocs = new LineFileDocs(lineDocsFile, false, storeBody, tvsBody, false,
                cloneDocs, null, null, null, addDVFields);
        IndexThreads threads = new IndexThreads(new Random(17), writer, new AtomicBoolean(false),
                lineFileDocs, indexThreadCount, -1, false, false, mode, docsPerSecPerThread, null, -1.0, -1);
        threads.start();

        mgr = new SearcherManager(writer, new SearcherFactory() {
            @Override
            public IndexSearcher newSearcher(IndexReader reader, IndexReader previous) {
                IndexSearcher s = new IndexSearcher(reader);
                s.setQueryCache(null); // don't bench the cache
                s.setSimilarity(sim);
                return s;
            }
        });

        System.out.println("reopen every " + reopenEverySec);

        Thread reopenThread = new Thread() {
            @Override
            public void run() {
                try {
                    final long startMS = System.currentTimeMillis();

                    int reopenCount = 1;
                    while (true) {
                        final long sleepMS = startMS + (reopenCount * reopenEveryMS)
                                - System.currentTimeMillis();
                        if (sleepMS < 0) {
                            System.out.println("WARNING: reopen fell behind by " + Math.abs(sleepMS) + " ms");
                        } else {
                            Thread.sleep(sleepMS);
                        }

                        mgr.maybeRefresh();
                        reopenCount++;
                        IndexSearcher s = mgr.acquire();
                        try {
                            if (ramDir != null) {
                                System.out.println(String.format(Locale.ENGLISH,
                                        "%.1fs: index: %d bytes in RAMDir; writer.maxDoc()=%d; searcher.maxDoc()=%d; searcher.numDocs()=%d",
                                        (System.currentTimeMillis() - startMS) / 1000.0,
                                        ramDir.ramBytesUsed(), writer.maxDoc(), s.getIndexReader().maxDoc(),
                                        s.getIndexReader().numDocs()));
                                //String[] l = ramDir.listAll();
                                //Arrays.sort(l);
                                //for(String f : l) {
                                //  System.out.println("  " + f + ": " + ramDir.fileLength(f));
                                //}
                            } else {
                                System.out.println(String.format(Locale.ENGLISH,
                                        "%.1fs: done reopen; writer.maxDoc()=%d; searcher.maxDoc()=%d; searcher.numDocs()=%d",
                                        (System.currentTimeMillis() - startMS) / 1000.0, writer.maxDoc(),
                                        s.getIndexReader().maxDoc(), s.getIndexReader().numDocs()));
                            }
                        } finally {
                            mgr.release(s);
                        }
                    }
                } catch (Exception e) {
                    throw new RuntimeException(e);
                }
            }
        };
        reopenThread.setName("ReopenThread");
        reopenThread.setPriority(4 + Thread.currentThread().getPriority());
        reopenThread.start();

    } else {
        dir = dir0;
        writer = null;
        final DirectoryReader reader;
        if (commit != null && commit.length() > 0) {
            System.out.println("Opening searcher on commit=" + commit);
            reader = DirectoryReader.open(PerfUtils.findCommitPoint(commit, dir));
        } else {
            // open last commit
            reader = DirectoryReader.open(dir);
        }
        IndexSearcher s = new IndexSearcher(reader);
        s.setQueryCache(null); // don't bench the cache
        s.setSimilarity(sim);
        System.out.println("maxDoc=" + reader.maxDoc() + " numDocs=" + reader.numDocs() + " %tg deletes="
                + (100. * reader.maxDoc() / reader.numDocs()));

        mgr = new SingleIndexSearcher(s);
    }

    System.out.println((System.currentTimeMillis() - tSearcherStart) + " msec to init searcher/NRT");

    {
        IndexSearcher s = mgr.acquire();
        try {
            System.out.println("Searcher: numDocs=" + s.getIndexReader().numDocs() + " maxDoc="
                    + s.getIndexReader().maxDoc() + ": " + s);
        } finally {
            mgr.release(s);
        }
    }

    //System.out.println("searcher=" + searcher);

    FacetsConfig facetsConfig = new FacetsConfig();
    facetsConfig.setHierarchical("Date", true);

    TaxonomyReader taxoReader;
    Path taxoPath = Paths.get(args.getString("-indexPath"), "facets");
    Directory taxoDir = od.open(taxoPath);
    if (DirectoryReader.indexExists(taxoDir)) {
        taxoReader = new DirectoryTaxonomyReader(taxoDir);
        System.out.println("Taxonomy has " + taxoReader.getSize() + " ords");
    } else {
        taxoReader = null;
    }

    final Random staticRandom = new Random(staticRandomSeed);
    final Random random = new Random(randomSeed);

    final DirectSpellChecker spellChecker = new DirectSpellChecker();
    final IndexState indexState = new IndexState(mgr, taxoReader, fieldName, spellChecker, hiliteImpl,
            facetsConfig);

    final QueryParser queryParser = new QueryParser("body", a);
    TaskParser taskParser = new TaskParser(indexState, queryParser, fieldName, topN, staticRandom,
            doStoredLoads);

    final TaskSource tasks;

    if (tasksFile.startsWith("server:")) {
        int idx = tasksFile.indexOf(':', 8);
        if (idx == -1) {
            throw new RuntimeException(
                    "server is missing the port; should be server:interface:port (got: " + tasksFile + ")");
        }
        String iface = tasksFile.substring(7, idx);
        int port = Integer.valueOf(tasksFile.substring(1 + idx));
        RemoteTaskSource remoteTasks = new RemoteTaskSource(iface, port, searchThreadCount, taskParser);

        // nocommit must stop thread?
        tasks = remoteTasks;
    } else {
        // Load the tasks from a file:
        final int taskRepeatCount = args.getInt("-taskRepeatCount");
        final int numTaskPerCat = args.getInt("-tasksPerCat");
        tasks = new LocalTaskSource(indexState, taskParser, tasksFile, staticRandom, random, numTaskPerCat,
                taskRepeatCount, doPKLookup);
        System.out.println("Task repeat count " + taskRepeatCount);
        System.out.println("Tasks file " + tasksFile);
        System.out.println("Num task per cat " + numTaskPerCat);
    }

    args.check();

    // Evil respeller:
    //spellChecker.setMinPrefix(0);
    //spellChecker.setMaxInspections(1024);

    final TaskThreads taskThreads = new TaskThreads(tasks, indexState, searchThreadCount);
    Thread.sleep(10);

    final long startNanos = System.nanoTime();
    taskThreads.start();
    taskThreads.finish();
    final long endNanos = System.nanoTime();

    System.out.println("\n" + ((endNanos - startNanos) / 1000000.0) + " msec total");

    final List<Task> allTasks = tasks.getAllTasks();

    PrintStream out = new PrintStream(logFile);

    if (allTasks != null) {
        // Tasks were local: verify checksums:

        // indexState.setDocIDToID();

        final Map<Task, Task> tasksSeen = new HashMap<Task, Task>();

        out.println("\nResults for " + allTasks.size() + " tasks:");

        boolean fail = false;
        for (final Task task : allTasks) {
            if (verifyCheckSum) {
                final Task other = tasksSeen.get(task);
                if (other != null) {
                    if (task.checksum() != other.checksum()) {
                        System.out.println("\nTASK:");
                        task.printResults(System.out, indexState);
                        System.out.println("\nOTHER TASK:");
                        other.printResults(System.out, indexState);
                        fail = true;
                        //throw new RuntimeException("task " + task + " hit different checksums: " + task.checksum() + " vs " + other.checksum() + " other=" + other);
                    }
                } else {
                    tasksSeen.put(task, task);
                }
            }
            out.println("\nTASK: " + task);
            out.println("  " + (task.runTimeNanos / 1000000.0) + " msec");
            out.println("  thread " + task.threadID);
            task.printResults(out, indexState);
        }
        if (fail) {
            throw new RuntimeException("some tasks got different results across different threads");
        }

        allTasks.clear();
    }

    mgr.close();

    if (taxoReader != null) {
        taxoReader.close();
    }

    if (writer != null) {
        // Don't actually commit any index changes:
        writer.rollback();
    }

    dir.close();

    if (printHeap) {
        // Try to get RAM usage -- some ideas poached from http://www.javaworld.com/javaworld/javatips/jw-javatip130.html
        final Runtime runtime = Runtime.getRuntime();
        long usedMem1 = PerfUtils.usedMemory(runtime);
        long usedMem2 = Long.MAX_VALUE;
        for (int iter = 0; iter < 10; iter++) {
            runtime.runFinalization();
            runtime.gc();
            Thread.yield();
            Thread.sleep(100);
            usedMem2 = usedMem1;
            usedMem1 = PerfUtils.usedMemory(runtime);
        }
        out.println("\nHEAP: " + PerfUtils.usedMemory(runtime));
    }

    out.close();
}
From source file:pretraga.IsolationSimilarity.java
public List<String> searchByCategory(String searchingTerm, String category, boolean veryPrecision) {
    try {
        Directory dir = FSDirectory.open(new File(indexDirectoryPath).toPath());
        IndexReader reader = DirectoryReader.open(dir);
        IndexSearcher searcher = new IndexSearcher(reader);
        searcher.setSimilarity(new ClassicSimilarity());

        QueryParser parser = new QueryParser(category, analyzer);
        String queryText = searchingTerm.toLowerCase();
        if (!veryPrecision) {
            queryText += "*";
        }
        Query q = parser.parse(queryText);
        TopScoreDocCollector collector = TopScoreDocCollector.create(10);
        searcher.search(q, collector);
        TopDocs docs = collector.topDocs();

        List<String> ret = new ArrayList<>();
        // Iterate over the hits actually collected; totalHits may exceed the
        // collector's limit of 10, so bound the loop by scoreDocs.length.
        for (int i = 0; i < docs.scoreDocs.length; i++) {
            Document d = reader.document(docs.scoreDocs[i].doc);
            ret.add(d.get(category) + ", " + d.get(SIZE) + ", score: " + docs.scoreDocs[i].score);
        }
        reader.close();
        dir.close();
        return ret;
    } catch (Exception e) {
        System.err.println(e.toString());
        return new ArrayList<>();
    }
}
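The example above sets ClassicSimilarity only at search time. Because a Similarity also controls how norms are encoded at index time, index-time and search-time choices should generally agree; the reviewclassification examples below follow that pattern. A minimal sketch of the pairing, assuming an in-memory index and an addDocuments step elided for brevity:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

Similarity sim = new ClassicSimilarity();

Directory index = new RAMDirectory();
IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
config.setSimilarity(sim); // consulted when norms are encoded at index time
IndexWriter writer = new IndexWriter(index, config);
// ... add documents here ...
writer.close();

IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(index));
searcher.setSimilarity(sim); // consulted when hits are scored at query time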
From source file:retriever.TermStats.java
IndexSearcher buildTemporalIndexSearcher(IndexReader reader) throws Exception {
    IndexSearcher searcher = new IndexSearcher(reader);
    if (queryTranslation) {
        //searcher.setSimilarity(new BM25Similarity());
        //searcher.setSimilarity(new LMDirichletSimilarity());
        searcher.setSimilarity(new LMJelinekMercerSimilarity(0.4f));
    } else {
        searcher.setSimilarity(new BM25PayloadSimilarity(1.2f, 0.75f));
    }
    return searcher;
}
From source file:reviewclassification.ReviewClassification.java
/**
 * Makes predictions for a test set. Saves them in a hash map that is later written to a file,
 * in ascending file name order. Uses cos_score by default.
 * @param training_set The training set
 * @param query_set The test set
 * @param threshold The threshold used for queries and predictions
 * @param filename The name of the file that holds the results
 * @throws org.apache.lucene.queryparser.classic.ParseException
 * @throws IOException
 */
public static void predictTestSet(ArrayList<Document> training_set, ArrayList<Document> query_set,
        int threshold, String filename)
        throws org.apache.lucene.queryparser.classic.ParseException, IOException {
    Similarity cos_sim = new ClassicSimilarity();
    FileWriter outfile = new FileWriter(filename);
    HashMap<String, Boolean> predictions = new HashMap<>();

    tlog(ft, "Building document index.");

    // Lucene stuff: build an analyzer and an index, create a configuration, make an index writer,
    // write documents to the index, then make a reader and a searcher from the reader.
    // Cosine similarity is set as the similarity method in both the configuration and the searcher.
    StandardAnalyzer analyzer = new StandardAnalyzer();
    Directory index = new RAMDirectory();
    IndexWriterConfig config = new IndexWriterConfig(analyzer);
    config.setSimilarity(cos_sim);
    IndexWriter w = new IndexWriter(index, config);
    addDocuments(w, training_set);
    w.close();
    IndexReader reader = DirectoryReader.open(index);
    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(cos_sim);

    tlog(ft, "Done building index. Predicting the test set.");

    // For each review in the test set, query the index, get the results, and predict with a given threshold.
    // Then add the prediction to the hash map. The key is the name of the file. We only have the path,
    // so we split it, get the filename, and remove the extension.
    for (Document doc : query_set) {
        ArrayList<Result> results = query(doc, analyzer, searcher, threshold);
        boolean cos_pred = predict(results, doc, threshold, COS_SCORE);
        String[] str = doc.get("path").split("/");
        predictions.put(str[str.length - 1].split("\\.")[0], cos_pred);
    }

    // Sort files in file name ascending order
    tlog(ft, "Done predicting test set. Sorting files.");
    ArrayList<String> files = new ArrayList<>(predictions.keySet());
    files.sort(new Comparator<String>() {
        @Override
        public int compare(String s1, String s2) {
            return s1.compareTo(s2);
        }
    });

    tlog(ft, "Done sorting files. Writing to disk.");

    // Write results to disk
    for (String s : files) {
        outfile.write(s + " " + boolToInt(predictions.get(s)) + System.lineSeparator());
    }
    outfile.close();

    tlog(ft, "Done writing to disk. Results in: " + filename);
}
From source file:reviewclassification.ReviewClassification.java
/**
 * Uses 9/10 of the training set as the training set, and 1/10 as the test set, chosen randomly.
 * Makes predictions for all 3 scoring methods and for multiple thresholds, to decide the best
 * scoring method and the best threshold to use.
 * @param documents The training set, which will be divided to create a test set
 * @param threshold_start The minimum threshold
 * @param threshold_end The maximum threshold
 * @param filename The name of the file that holds the results
 * @throws IOException
 * @throws org.apache.lucene.queryparser.classic.ParseException
 */
public static void accuracyTest(ArrayList<Document> documents, int threshold_start, int threshold_end,
        String filename) throws IOException, org.apache.lucene.queryparser.classic.ParseException {
    long seed = System.nanoTime();
    Collections.shuffle(documents, new Random(seed));
    FileWriter outfile = new FileWriter(filename);

    // 9/10 of the training set is used for training.
    // The remaining 1/10 is used for testing.
    ArrayList<Document> training_set = new ArrayList<>(documents.subList(0, documents.size() * 9 / 10));
    ArrayList<Document> test_set = new ArrayList<>(
            documents.subList(documents.size() * 9 / 10, documents.size()));

    // Metrics objects hold tp, fp, tn, and fn counters. We keep one for each threshold.
    // We are testing with 3 scoring methods, so we need 3 lists of objects,
    // each containing an object per threshold.
    ArrayList<Integer> threshold_list = new ArrayList<>();
    ArrayList<Metrics> metrics_list_knn = new ArrayList<>();
    ArrayList<Metrics> metrics_list_knn_sentiment = new ArrayList<>();
    ArrayList<Metrics> metrics_list_cos_score = new ArrayList<>();

    // Initializing the metrics objects.
    for (int i = threshold_start; i <= threshold_end; i++) {
        threshold_list.add(i);
        metrics_list_knn.add(new Metrics());
        metrics_list_knn_sentiment.add(new Metrics());
        metrics_list_cos_score.add(new Metrics());
    }

    // Built-in cosine similarity method.
    Similarity cos_sim = new ClassicSimilarity();

    tlog(ft, "Building document index.");

    // Lucene stuff: build an analyzer and an index, create a configuration, make an index writer,
    // write documents to the index, then make a reader and a searcher from the reader.
    // Cosine similarity is set as the similarity method in both the configuration and the searcher.
    StandardAnalyzer analyzer = new StandardAnalyzer();
    Directory index = new RAMDirectory();
    IndexWriterConfig config = new IndexWriterConfig(analyzer);
    config.setSimilarity(cos_sim);
    IndexWriter w = new IndexWriter(index, config);
    addDocuments(w, training_set);
    w.close();
    IndexReader reader = DirectoryReader.open(index);
    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(cos_sim);

    tlog(ft, "Done building index. Predicting the test set.");

    // For each review in the test set, query the index, get the results, then predict with a given threshold.
    // Testing multiple thresholds to find which one to use.
    for (Document doc : test_set) {
        ArrayList<Result> results = query(doc, analyzer, searcher,
                threshold_list.get(threshold_list.size() - 1));
        boolean query_class = doc.get("path").contains("pos");

        // We execute the query only once, then for each threshold count the results
        // with the appropriate metrics object.
        for (int i = 0; i < threshold_list.size(); i++) {
            boolean knn_pred = predict(results, doc, threshold_list.get(i), KNN);
            boolean knn_senti_pred = predict(results, doc, threshold_list.get(i), KNN_SENTIMENT);
            boolean cos_pred = predict(results, doc, threshold_list.get(i), COS_SCORE);

            update_metrics(metrics_list_knn.get(i), query_class, knn_pred);
            update_metrics(metrics_list_knn_sentiment.get(i), query_class, knn_senti_pred);
            update_metrics(metrics_list_cos_score.get(i), query_class, cos_pred);
        }
    }

    tlog(ft, "Done predicting test set. Calculating accuracies and writing to file.");

    // For each metrics object we call calculate(), which computes the accuracy, then write it to file.
    for (int i = 0; i < threshold_list.size(); i++) {
        metrics_list_knn.get(i).calculate();
        metrics_list_knn_sentiment.get(i).calculate();
        metrics_list_cos_score.get(i).calculate();
        outfile.write(threshold_list.get(i) + " " + metrics_list_knn.get(i).getAccuracy() + " "
                + metrics_list_knn_sentiment.get(i).getAccuracy() + " "
                + metrics_list_cos_score.get(i).getAccuracy() + System.lineSeparator());
    }
    outfile.close();

    tlog(ft, "Done writing to file. Results in: " + filename);
}
From source file:searcher.CollStat.java
IndexSearcher initSearcher(IndexReader reader) throws Exception {
    IndexSearcher searcher = new IndexSearcher(reader);
    float lambda = Float.parseFloat(prop.getProperty("lm.lambda", "0.6"));
    searcher.setSimilarity(new LMJelinekMercerSimilarity(lambda));
    return searcher;
}
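As the examples show, setSimilarity only affects the searcher it is called on, so a single reader can back several searchers with different scoring models, which is convenient for retrieval experiments like the ones above. A minimal sketch, assuming the reader and query already exist; the 0.6 lambda is a placeholder:

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.LMJelinekMercerSimilarity;

TopDocs[] compareSimilarities(IndexReader reader, Query query) throws Exception {
    // Two searchers over the same reader: the set of matching documents is the
    // same, but the scores (and therefore the ranking) can differ.
    IndexSearcher bm25 = new IndexSearcher(reader);
    bm25.setSimilarity(new BM25Similarity());

    IndexSearcher lm = new IndexSearcher(reader);
    lm.setSimilarity(new LMJelinekMercerSimilarity(0.6f));

    return new TopDocs[] { bm25.search(query, 10), lm.search(query, 10) };
}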