List of usage examples for the org.apache.lucene.search.similarities.BM25Similarity constructor BM25Similarity(float k1, float b)
public BM25Similarity(float k1, float b)
From source file:apps.LuceneIndexer.java
License:Apache License
public static void main(String[] args) { Options options = new Options(); options.addOption("i", null, true, "input file"); options.addOption("o", null, true, "output directory"); options.addOption("r", null, true, "optional output TREC-format QREL file"); options.addOption("bm25_b", null, true, "BM25 parameter: b"); options.addOption("bm25_k1", null, true, "BM25 parameter: k1"); options.addOption("bm25fixed", null, false, "use the fixed BM25 similarity"); Joiner commaJoin = Joiner.on(','); Joiner spaceJoin = Joiner.on(' '); options.addOption("source_type", null, true, "document source type: " + commaJoin.join(SourceFactory.getDocSourceList())); // If you increase this value, you may need to modify the following line in *.sh file // export MAVEN_OPTS="-Xms8192m -server" double ramBufferSizeMB = 1024 * 8; // 8 GB CommandLineParser parser = new org.apache.commons.cli.GnuParser(); IndexWriter indexWriter = null;// w w w.ja v a2 s .c om BufferedWriter qrelWriter = null; int docNum = 0; try { CommandLine cmd = parser.parse(options, args); String inputFileName = null, outputDirName = null, qrelFileName = null; if (cmd.hasOption("i")) { inputFileName = cmd.getOptionValue("i"); } else { Usage("Specify 'input file'", options); } if (cmd.hasOption("o")) { outputDirName = cmd.getOptionValue("o"); } else { Usage("Specify 'index directory'", options); } if (cmd.hasOption("r")) { qrelFileName = cmd.getOptionValue("r"); } String sourceName = cmd.getOptionValue("source_type"); if (sourceName == null) Usage("Specify document source type", options); if (qrelFileName != null) qrelWriter = new BufferedWriter(new FileWriter(qrelFileName)); File outputDir = new File(outputDirName); if (!outputDir.exists()) { if (!outputDir.mkdirs()) { System.out.println("couldn't create " + outputDir.getAbsolutePath()); System.exit(1); } } if (!outputDir.isDirectory()) { System.out.println(outputDir.getAbsolutePath() + " is not a directory!"); System.exit(1); } if (!outputDir.canWrite()) { 
System.out.println("Can't write to " + outputDir.getAbsolutePath()); System.exit(1); } boolean useFixedBM25 = cmd.hasOption("bm25fixed"); float bm25_k1 = UtilConst.BM25_K1_DEFAULT, bm25_b = UtilConst.BM25_B_DEFAULT; if (cmd.hasOption("bm25_k1")) { try { bm25_k1 = Float.parseFloat(cmd.getOptionValue("bm25_k1")); } catch (NumberFormatException e) { Usage("Wrong format for 'bm25_k1'", options); } } if (cmd.hasOption("bm25_b")) { try { bm25_b = Float.parseFloat(cmd.getOptionValue("bm25_b")); } catch (NumberFormatException e) { Usage("Wrong format for 'bm25_b'", options); } } EnglishAnalyzer analyzer = new EnglishAnalyzer(); FSDirectory indexDir = FSDirectory.open(Paths.get(outputDirName)); IndexWriterConfig indexConf = new IndexWriterConfig(analyzer); /* OpenMode.CREATE creates a new index or overwrites an existing one. https://lucene.apache.org/core/6_0_0/core/org/apache/lucene/index/IndexWriterConfig.OpenMode.html#CREATE */ indexConf.setOpenMode(OpenMode.CREATE); indexConf.setRAMBufferSizeMB(ramBufferSizeMB); System.out.println(String.format("BM25 parameters k1=%f b=%f ", bm25_k1, bm25_b)); if (useFixedBM25) { System.out.println(String.format("Using fixed BM25Simlarity, k1=%f b=%f", bm25_k1, bm25_b)); indexConf.setSimilarity(new BM25SimilarityFix(bm25_k1, bm25_b)); } else { System.out.println(String.format("Using Lucene BM25Similarity, k1=%f b=%f", bm25_k1, bm25_b)); indexConf.setSimilarity(new BM25Similarity(bm25_k1, bm25_b)); } indexWriter = new IndexWriter(indexDir, indexConf); DocumentSource inpDocSource = SourceFactory.createDocumentSource(sourceName, inputFileName); DocumentEntry inpDoc = null; TextCleaner textCleaner = new TextCleaner(null); while ((inpDoc = inpDocSource.next()) != null) { ++docNum; Document luceneDoc = new Document(); ArrayList<String> cleanedToks = textCleaner.cleanUp(inpDoc.mDocText); String cleanText = spaceJoin.join(cleanedToks); // System.out.println(inpDoc.mDocId); // System.out.println(cleanText); // 
System.out.println("=============================="); luceneDoc.add(new StringField(UtilConst.FIELD_ID, inpDoc.mDocId, Field.Store.YES)); luceneDoc.add(new TextField(UtilConst.FIELD_TEXT, cleanText, Field.Store.YES)); indexWriter.addDocument(luceneDoc); if (inpDoc.mIsRel != null && qrelWriter != null) { saveQrelOneEntry(qrelWriter, inpDoc.mQueryId, inpDoc.mDocId, inpDoc.mIsRel ? MAX_GRADE : 0); } if (docNum % 1000 == 0) System.out.println(String.format("Indexed %d documents", docNum)); } } catch (ParseException e) { e.printStackTrace(); Usage("Cannot parse arguments" + e, options); } catch (Exception e) { System.err.println("Terminating due to an exception: " + e); System.exit(1); } finally { System.out.println(String.format("Indexed %d documents", docNum)); try { if (null != indexWriter) indexWriter.close(); if (null != qrelWriter) qrelWriter.close(); } catch (IOException e) { System.err.println("IO exception: " + e); e.printStackTrace(); } } }
From source file:apps.LuceneQuery.java
License:Apache License
public static void main(String[] args) { Options options = new Options(); options.addOption("d", null, true, "index directory"); options.addOption("i", null, true, "input file"); options.addOption("s", null, true, "stop word file"); options.addOption("n", null, true, "max # of results"); options.addOption("o", null, true, "a TREC-style output file"); options.addOption("r", null, true, "an optional QREL file, if specified," + "we save results only for queries for which we find at least one relevant entry."); options.addOption("prob", null, true, "question sampling probability"); options.addOption("max_query_qty", null, true, "a maximum number of queries to run"); options.addOption("bm25_b", null, true, "BM25 parameter: b"); options.addOption("bm25_k1", null, true, "BM25 parameter: k1"); options.addOption("bm25fixed", null, false, "use the fixed BM25 similarity"); options.addOption("seed", null, true, "random seed"); Joiner commaJoin = Joiner.on(','); Joiner spaceJoin = Joiner.on(' '); options.addOption("source_type", null, true, "query source type: " + commaJoin.join(SourceFactory.getQuerySourceList())); CommandLineParser parser = new org.apache.commons.cli.GnuParser(); QrelReader qrels = null;/*ww w . ja va 2 s . 
co m*/ try { CommandLine cmd = parser.parse(options, args); String indexDir = null; if (cmd.hasOption("d")) { indexDir = cmd.getOptionValue("d"); } else { Usage("Specify 'index directory'", options); } String inputFileName = null; if (cmd.hasOption("i")) { inputFileName = cmd.getOptionValue("i"); } else { Usage("Specify 'input file'", options); } DictNoComments stopWords = null; if (cmd.hasOption("s")) { String stopWordFileName = cmd.getOptionValue("s"); stopWords = new DictNoComments(new File(stopWordFileName), true /* lowercasing */); System.out.println("Using the stopword file: " + stopWordFileName); } String sourceName = cmd.getOptionValue("source_type"); if (sourceName == null) Usage("Specify document source type", options); int numRet = 100; if (cmd.hasOption("n")) { numRet = Integer.parseInt(cmd.getOptionValue("n")); System.out.println("Retrieving at most " + numRet + " candidate entries."); } String trecOutFileName = null; if (cmd.hasOption("o")) { trecOutFileName = cmd.getOptionValue("o"); } else { Usage("Specify 'a TREC-style output file'", options); } double fProb = 1.0f; if (cmd.hasOption("prob")) { try { fProb = Double.parseDouble(cmd.getOptionValue("prob")); } catch (NumberFormatException e) { Usage("Wrong format for 'question sampling probability'", options); } } if (fProb <= 0 || fProb > 1) { Usage("Question sampling probability should be >0 and <=1", options); } System.out.println("Sample the following fraction of questions: " + fProb); float bm25_k1 = UtilConst.BM25_K1_DEFAULT, bm25_b = UtilConst.BM25_B_DEFAULT; if (cmd.hasOption("bm25_k1")) { try { bm25_k1 = Float.parseFloat(cmd.getOptionValue("bm25_k1")); } catch (NumberFormatException e) { Usage("Wrong format for 'bm25_k1'", options); } } if (cmd.hasOption("bm25_b")) { try { bm25_b = Float.parseFloat(cmd.getOptionValue("bm25_b")); } catch (NumberFormatException e) { Usage("Wrong format for 'bm25_b'", options); } } long seed = 0; String tmpl = cmd.getOptionValue("seed"); if (tmpl != null) seed = 
Long.parseLong(tmpl); System.out.println("Using seed: " + seed); Random randGen = new Random(seed); System.out.println(String.format("BM25 parameters k1=%f b=%f ", bm25_k1, bm25_b)); boolean useFixedBM25 = cmd.hasOption("bm25fixed"); EnglishAnalyzer analyzer = new EnglishAnalyzer(); Similarity similarity = null; if (useFixedBM25) { System.out.println(String.format("Using fixed BM25Simlarity, k1=%f b=%f", bm25_k1, bm25_b)); similarity = new BM25SimilarityFix(bm25_k1, bm25_b); } else { System.out.println(String.format("Using Lucene BM25Similarity, k1=%f b=%f", bm25_k1, bm25_b)); similarity = new BM25Similarity(bm25_k1, bm25_b); } int maxQueryQty = Integer.MAX_VALUE; if (cmd.hasOption("max_query_qty")) { try { maxQueryQty = Integer.parseInt(cmd.getOptionValue("max_query_qty")); } catch (NumberFormatException e) { Usage("Wrong format for 'max_query_qty'", options); } } System.out.println(String.format("Executing at most %d queries", maxQueryQty)); if (cmd.hasOption("r")) { String qrelFile = cmd.getOptionValue("r"); System.out.println("Using the qrel file: '" + qrelFile + "', queries not returning a relevant entry will be ignored."); qrels = new QrelReader(qrelFile); } System.out.println(String.format("Using indexing directory %s", indexDir)); LuceneCandidateProvider candProvider = new LuceneCandidateProvider(indexDir, analyzer, similarity); TextCleaner textCleaner = new TextCleaner(stopWords); QuerySource inpQuerySource = SourceFactory.createQuerySource(sourceName, inputFileName); QueryEntry inpQuery = null; BufferedWriter trecOutFile = new BufferedWriter(new FileWriter(new File(trecOutFileName))); int questNum = 0, questQty = 0; long totalTimeMS = 0; while ((inpQuery = inpQuerySource.next()) != null) { if (questQty >= maxQueryQty) break; ++questNum; String queryID = inpQuery.mQueryId; if (randGen.nextDouble() <= fProb) { ++questQty; String tokQuery = spaceJoin.join(textCleaner.cleanUp(inpQuery.mQueryText)); String query = 
TextCleaner.luceneSafeCleanUp(tokQuery).trim(); ResEntry[] results = null; if (query.isEmpty()) { results = new ResEntry[0]; System.out.println(String.format("WARNING, empty query id = '%s'", inpQuery.mQueryId)); } else { try { long start = System.currentTimeMillis(); results = candProvider.getCandidates(questNum, query, numRet); long end = System.currentTimeMillis(); long searchTimeMS = end - start; totalTimeMS += searchTimeMS; System.out.println(String.format( "Obtained results for the query # %d (answered %d queries), queryID %s the search took %d ms, we asked for max %d entries got %d", questNum, questQty, queryID, searchTimeMS, numRet, results.length)); } catch (ParseException e) { e.printStackTrace(); System.err.println( "Error parsing query: " + query + " orig question is :" + inpQuery.mQueryText); System.exit(1); } } boolean bSave = true; if (qrels != null) { boolean bOk = false; for (ResEntry r : results) { String label = qrels.get(queryID, r.mDocId); if (candProvider.isRelevLabel(label, 1)) { bOk = true; break; } } if (!bOk) bSave = false; } // System.out.println(String.format("Ranking results the query # %d queryId='%s' save results? %b", // questNum, queryID, bSave)); if (bSave) { saveTrecResults(queryID, results, trecOutFile, TREC_RUN, numRet); } } if (questNum % 1000 == 0) System.out.println(String.format("Proccessed %d questions", questNum)); } System.out.println(String.format("Proccessed %d questions, the search took %f MS on average", questQty, (float) totalTimeMS / questQty)); trecOutFile.close(); } catch (ParseException e) { e.printStackTrace(); Usage("Cannot parse arguments: " + e, options); } catch (Exception e) { System.err.println("Terminating due to an exception: " + e); System.exit(1); } }
From source file:io.anserini.qa.RetrieveSentences.java
License:Apache License
public Map<String, Float> search(SortedMap<Integer, String> topics, int numHits) throws IOException, ParseException { IndexSearcher searcher = new IndexSearcher(reader); //using BM25 scoring model Similarity similarity = new BM25Similarity(0.9f, 0.4f); searcher.setSimilarity(similarity);/* w ww . j a v a 2 s . c om*/ EnglishAnalyzer ea = new EnglishAnalyzer(); QueryParser queryParser = new QueryParser(FIELD_BODY, ea); queryParser.setDefaultOperator(QueryParser.Operator.OR); Map<String, Float> scoredDocs = new LinkedHashMap<>(); for (Map.Entry<Integer, String> entry : topics.entrySet()) { int qID = entry.getKey(); String queryString = entry.getValue(); Query query = AnalyzerUtils.buildBagOfWordsQuery(FIELD_BODY, ea, queryString); TopDocs rs = searcher.search(query, numHits); ScoreDoc[] hits = rs.scoreDocs; ScoredDocuments docs = ScoredDocuments.fromTopDocs(rs, searcher); for (int i = 0; i < docs.documents.length; i++) { scoredDocs.put(docs.documents[i].getField(FIELD_ID).stringValue(), docs.scores[i]); } } return scoredDocs; }
From source file:io.anserini.search.SearchCollection.java
License:Apache License
public SearchCollection(SearchArgs args) throws IOException { this.args = args; Path indexPath = Paths.get(args.index); if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) { throw new IllegalArgumentException(args.index + " does not exist or is not a directory."); }/*from ww w . j ava2 s. c o m*/ LOG.info("Reading index at " + args.index); this.reader = DirectoryReader.open(FSDirectory.open(indexPath)); // Figure out which scoring model to use. if (args.ql) { LOG.info("Using QL scoring model"); this.similarity = new LMDirichletSimilarity(args.mu); } else if (args.bm25) { LOG.info("Using BM25 scoring model"); this.similarity = new BM25Similarity(args.k1, args.b); } else if (args.f2log) { LOG.info("Using F2Log scoring model"); this.similarity = new F2LogSimilarity(args.f2log_s); } else { throw new IllegalArgumentException("Error: Must specify scoring model!"); } // Are we searching tweets? if (args.searchtweets) { analyzer = new TweetAnalyzer(); } else { analyzer = args.keepstop ? new EnglishAnalyzer(CharArraySet.EMPTY_SET) : new EnglishAnalyzer(); } isRerank = args.rm3 || args.axiom; // Set up the ranking cascade. cascade = new RerankerCascade(); if (args.rm3) { cascade.add(new Rm3Reranker(analyzer, FIELD_BODY, args)); } else if (args.axiom) { cascade.add(new AxiomReranker(FIELD_BODY, args)); } cascade.add(new ScoreTiesAdjusterReranker()); }
From source file:io.anserini.search.SearchTweets.java
License:Apache License
public static void main(String[] args) throws Exception { long curTime = System.nanoTime(); SearchArgs searchArgs = new SearchArgs(); CmdLineParser parser = new CmdLineParser(searchArgs, ParserProperties.defaults().withUsageWidth(90)); try {// www . j a v a2s. c o m parser.parseArgument(args); } catch (CmdLineException e) { System.err.println(e.getMessage()); parser.printUsage(System.err); System.err.println("Example: SearchTweets" + parser.printExample(OptionHandlerFilter.REQUIRED)); return; } LOG.info("Reading index at " + searchArgs.index); Directory dir; if (searchArgs.inmem) { LOG.info("Using MMapDirectory with preload"); dir = new MMapDirectory(Paths.get(searchArgs.index)); ((MMapDirectory) dir).setPreload(true); } else { LOG.info("Using default FSDirectory"); dir = FSDirectory.open(Paths.get(searchArgs.index)); } IndexReader reader = DirectoryReader.open(dir); IndexSearcher searcher = new IndexSearcher(reader); if (searchArgs.ql) { LOG.info("Using QL scoring model"); searcher.setSimilarity(new LMDirichletSimilarity(searchArgs.mu)); } else if (searchArgs.bm25) { LOG.info("Using BM25 scoring model"); searcher.setSimilarity(new BM25Similarity(searchArgs.k1, searchArgs.b)); } else { LOG.error("Error: Must specify scoring model!"); System.exit(-1); } RerankerCascade cascade = new RerankerCascade(); if (searchArgs.rm3) { cascade.add(new Rm3Reranker(IndexTweets.ANALYZER, StatusField.TEXT.name, "src/main/resources/io/anserini/rerank/rm3/rm3-stoplist.twitter.txt")); cascade.add(new RemoveRetweetsTemporalTiebreakReranker()); } else { cascade.add(new RemoveRetweetsTemporalTiebreakReranker()); } MicroblogTopicSet topics = MicroblogTopicSet.fromFile(new File(searchArgs.topics)); PrintStream out = new PrintStream(new FileOutputStream(new File(searchArgs.output))); LOG.info("Writing output to " + searchArgs.output); LOG.info("Initialized complete! 
(elapsed time = " + (System.nanoTime() - curTime) / 1000000 + "ms)"); long totalTime = 0; int cnt = 0; for (MicroblogTopic topic : topics) { long curQueryTime = System.nanoTime(); Filter filter = NumericRangeFilter.newLongRange(StatusField.ID.name, 0L, topic.getQueryTweetTime(), true, true); Query query = AnalyzerUtils.buildBagOfWordsQuery(StatusField.TEXT.name, IndexTweets.ANALYZER, topic.getQuery()); TopDocs rs = searcher.search(query, filter, searchArgs.hits); RerankerContext context = new RerankerContext(searcher, query, topic.getId(), topic.getQuery(), Sets.newHashSet(AnalyzerUtils.tokenize(IndexTweets.ANALYZER, topic.getQuery())), filter); ScoredDocuments docs = cascade.run(ScoredDocuments.fromTopDocs(rs, searcher), context); for (int i = 0; i < docs.documents.length; i++) { String qid = topic.getId().replaceFirst("^MB0*", ""); out.println(String.format("%s Q0 %s %d %f %s", qid, docs.documents[i].getField(StatusField.ID.name).numericValue(), (i + 1), docs.scores[i], searchArgs.runtag)); } long qtime = (System.nanoTime() - curQueryTime) / 1000000; LOG.info("Query " + topic.getId() + " (elapsed time = " + qtime + "ms)"); totalTime += qtime; cnt++; } LOG.info("All queries completed!"); LOG.info("Total elapsed time = " + totalTime + "ms"); LOG.info("Average query latency = " + (totalTime / cnt) + "ms"); reader.close(); out.close(); }
From source file:io.anserini.search.SearchWebCollection.java
License:Apache License
public static void main(String[] args) throws IOException, ParseException { long curTime = System.nanoTime(); SearchArgs searchArgs = new SearchArgs(); CmdLineParser parser = new CmdLineParser(searchArgs, ParserProperties.defaults().withUsageWidth(90)); try {//from ww w . j a va 2 s .c o m parser.parseArgument(args); } catch (CmdLineException e) { System.err.println(e.getMessage()); parser.printUsage(System.err); System.err.println("Example: SearchWebCollection" + parser.printExample(OptionHandlerFilter.REQUIRED)); return; } LOG.info("Reading index at " + searchArgs.index); Directory dir; if (searchArgs.inmem) { LOG.info("Using MMapDirectory with preload"); dir = new MMapDirectory(Paths.get(searchArgs.index)); ((MMapDirectory) dir).setPreload(true); } else { LOG.info("Using default FSDirectory"); dir = FSDirectory.open(Paths.get(searchArgs.index)); } Similarity similarity = null; if (searchArgs.ql) { LOG.info("Using QL scoring model"); similarity = new LMDirichletSimilarity(searchArgs.mu); } else if (searchArgs.bm25) { LOG.info("Using BM25 scoring model"); similarity = new BM25Similarity(searchArgs.k1, searchArgs.b); } else { LOG.error("Error: Must specify scoring model!"); System.exit(-1); } RerankerCascade cascade = new RerankerCascade(); if (searchArgs.rm3) { cascade.add(new Rm3Reranker(new EnglishAnalyzer(), "body", "src/main/resources/io/anserini/rerank/rm3/rm3-stoplist.gov2.txt")); } else { cascade.add(new IdentityReranker()); } Path topicsFile = Paths.get(searchArgs.topics); if (!Files.exists(topicsFile) || !Files.isRegularFile(topicsFile) || !Files.isReadable(topicsFile)) { throw new IllegalArgumentException( "Topics file : " + topicsFile + " does not exist or is not a (readable) file."); } SortedMap<Integer, String> topics = io.anserini.document.Collection.GOV2.equals(searchArgs.collection) ? 
readTeraByteTackQueries(topicsFile) : readWebTrackQueries(topicsFile); SearchWebCollection searcher = new SearchWebCollection(searchArgs.index); searcher.search(topics, searchArgs.output, similarity, searchArgs.hits); searcher.close(); }
From source file:io.anserini.util.SearchTimeUtil.java
License:Apache License
/**
 * Command-line entry point that times BM25 (k1 = 0.9, b = 0.4) searches over six web-track
 * topic files, running each file three times with 1000 hits and printing the elapsed time
 * of every run.
 *
 * <p>Each run writes its submission to a freshly created temporary file.
 *
 * @param args a single argument: the index directory
 * @throws IOException    if a topic file, the index or a temporary file cannot be accessed
 * @throws ParseException if a query cannot be parsed
 */
public static void main(String[] args) throws IOException, ParseException {
    if (args.length != 1) {
        System.err.println("Usage: SearchTimeUtil <indexDir>");
        System.err.println("indexDir: index directory");
        System.exit(1);
    }

    String[] topics = { "topics.web.1-50.txt", "topics.web.51-100.txt", "topics.web.101-150.txt",
            "topics.web.151-200.txt", "topics.web.201-250.txt", "topics.web.251-300.txt" };

    SearchWebCollection searcher = new SearchWebCollection(args[0]);

    try {
        for (String topicFile : topics) {
            Path topicsFile = Paths.get("src/resources/topics-and-qrels/", topicFile);
            SortedMap<Integer, String> queries = SearchWebCollection.readWebTrackQueries(topicsFile);
            for (int i = 1; i <= 3; i++) {
                final long start = System.nanoTime();
                String submissionFile = File.createTempFile(topicFile + "_" + i, ".tmp").getAbsolutePath();
                searcher.search(queries, submissionFile, new BM25Similarity(0.9f, 0.4f), 1000);
                final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start,
                        TimeUnit.NANOSECONDS);
                System.out.println(topicFile + "_" + i + " search completed in "
                        + DurationFormatUtils.formatDuration(durationMillis, "mm:ss:SSS"));
            }
        }
    } finally {
        // Close the searcher even when reading a topic file or a search throws.
        searcher.close();
    }
}
From source file:it.unipd.dei.ims.lucene.clef.applications.BatchRetrieval.java
License:Apache License
public static void main(String[] args) { Properties properties = new Properties(); InputStream input = null;//www . j av a 2 s .c o m try { if (System.getProperty("properties.path") != null) { input = new FileInputStream(System.getProperty("properties.path")); properties.load(input); } else { logger.info("Loading default property file [resources/lucene-clef.properties]"); ClassLoader loader = Thread.currentThread().getContextClassLoader(); input = loader.getResourceAsStream("lucene-clef.properties"); properties.load(input); } } catch (IOException ex) { ex.printStackTrace(); } finally { if (input != null) { try { input.close(); } catch (IOException e) { e.printStackTrace(); } } } properties.putAll(System.getProperties()); Path indexPath = new File(properties.getProperty("index.path")).toPath(); Path runPath = new File(properties.getProperty("run.path")).toPath(); String runTag = properties.getProperty("run.tag"); String language = properties.getProperty("language"); String stemmer = properties.getProperty("stemmer"); String stopsetType = properties.getProperty("stopset.type"); String stopsetPath = properties.getProperty("stopset.path"); try { Directory directory = new SimpleFSDirectory(indexPath); IndexReader reader = DirectoryReader.open(directory); IndexSearcher searcher = new IndexSearcher(reader); String model = properties.getProperty("run.model").toUpperCase(); Similarity similarity; switch (model) { case "BM25": similarity = new BM25Similarity(Float.parseFloat(properties.getProperty("bm25.k1", "1.2f")), Float.parseFloat(properties.getProperty("bm25.b", "0.75f"))); break; default: throw new UnsupportedOperationException("Model " + model + " not supported yet"); } searcher.setSimilarity(similarity); int maxResults = Integer.parseInt(properties.getProperty("maxresults", "1000")); SubmissionReport runfile = new SubmissionReport( new PrintWriter(Files.newBufferedWriter(runPath, StandardCharsets.UTF_8)), model); String topicPath = 
properties.getProperty("topics.path"); String[] topicFields = properties.getProperty("topics.fields").split(";"); CharArraySet stopset = AnalyzerFactory.createStopset(language, stopsetType, stopsetPath); QualityQuery qqs[] = getQualityQueries(topicPath, topicFields); QualityQueryParser qqParser = new ClefQQParser(topicFields, BuildIndex.BODY_FIELD_NAME, language, stemmer, stopset); // run the benchmark QualityBenchmark qrun = new QualityBenchmark(qqs, qqParser, searcher, BuildIndex.ID_FIELD_NAME); qrun.setMaxResults(maxResults); QualityStats stats[] = qrun.execute(null, runfile, null); reader.close(); directory.close(); } catch (IOException e) { e.printStackTrace(); } catch (Exception e) { e.printStackTrace(); } }
From source file:NewsIR_search.NewsIRSearcher.java
private void setSimilarityFn_ResFileName(int indexcounter) throws IOException { searcher = new IndexSearcher(reader); float bm25_k1, bm25_b, lm_jm_lambda, lm_d_mu; switch (setSimilarityFlag.toLowerCase()) { case "bm25": bm25_k1 = Float.parseFloat(prop.getProperty("bm25-k1")); bm25_b = Float.parseFloat(prop.getProperty("bm25-b")); System.out.println("Setting BM25 with k1: " + bm25_k1 + " b: " + bm25_b); searcher.setSimilarity(new BM25Similarity(bm25_k1, bm25_b)); run_name = "bm25-k1=" + bm25_k1 + "-b=" + bm25_b + "-" + num_ret; break;//from w w w . j av a 2 s .c om case "lm-jm": lm_jm_lambda = Float.parseFloat(prop.getProperty("lm_jm_lambda")); System.out.println("Setting LMJelinekMercer with lambda: " + lm_jm_lambda); searcher.setSimilarity(new LMJelinekMercerSimilarity(lm_jm_lambda)); run_name = "lm-jm-lambda" + lm_jm_lambda; break; case "lm-d": lm_d_mu = Float.parseFloat(prop.getProperty("lm_d_mu")); System.out.println("Setting LMDirichlet with mu: " + lm_d_mu); searcher.setSimilarity(new LMDirichletSimilarity(lm_d_mu)); run_name = "lm-d-mu=" + lm_d_mu + "-"; break; case "default": System.out.println("Setting DefaultSimilarity of Lucene: "); searcher.setSimilarity(new DefaultSimilarity()); run_name = "default-lucene"; break; default: System.out.println("Setting DefaultSimilarity of Lucene: "); searcher.setSimilarity(new DefaultSimilarity()); run_name = "default-lucene"; break; } resultsFile = run_name; }
From source file:nl.uva.DataHandler.Indexing.java
@Override public void Indexer(String indexPathString) throws Exception, Throwable { try {//from w w w .j av a2s . com log.info( "----------------------- INDEXING - Override Version: for BM25 :) --------------------------"); Path ipath = FileSystems.getDefault().getPath(indexPathString); super.IndexesCleaner(indexPathString); MyAnalyzer myAnalyzer; if (!super.stopWordsRemoving) myAnalyzer = new MyAnalyzer(super.stemming); else myAnalyzer = new MyAnalyzer(super.stemming, super.LoadStopwords()); Analyzer analyzer = myAnalyzer.getAnalyzer(configFile.getProperty("CORPUS_LANGUAGE")); PerFieldAnalyzerWrapper prfWrapper = new PerFieldAnalyzerWrapper(analyzer, super.analyzerMap); IndexWriterConfig irc = new IndexWriterConfig(prfWrapper); irc.setSimilarity(new BM25Similarity(1.2F, 0.75F)); this.writer = new IndexWriter(new SimpleFSDirectory(ipath), irc); this.docIndexer(); this.writer.commit(); this.writer.close(); analyzer.close(); prfWrapper.close(); log.info("-------------------------------------------------"); log.info("Index is created successfully..."); log.info("-------------------------------------------------"); } catch (Exception ex) { log.error(ex); throw ex; } }