List of usage examples for the org.apache.lucene.search.similarities.LMDirichletSimilarity constructor LMDirichletSimilarity
public LMDirichletSimilarity(CollectionModel collectionModel)
From source file:cc.twittertools.search.api.TrecSearchHandler.java
License:Apache License
public TrecSearchHandler(File indexPath, @Nullable Map<String, String> credentials) throws IOException { Preconditions.checkNotNull(indexPath); Preconditions.checkArgument(indexPath.exists()); // Can be null, in which case we don't check for credentials. this.credentials = credentials; IndexReader reader = DirectoryReader.open(FSDirectory.open(indexPath)); searcher = new IndexSearcher(reader); searcher.setSimilarity(new LMDirichletSimilarity(2500.0f)); }
From source file:cc.twittertools.search.api.TrecSearchThriftServer.java
License:Apache License
@SuppressWarnings("static-access") public static void main(String[] args) throws Exception { Options options = new Options(); options.addOption(new Option(HELP_OPTION, "show help")); options.addOption(OptionBuilder.withArgName("port").hasArg().withDescription("port").create(PORT_OPTION)); options.addOption(/* w ww . j a v a2 s .co m*/ OptionBuilder.withArgName("index").hasArg().withDescription("index location").create(INDEX_OPTION)); options.addOption(OptionBuilder.withArgName("num").hasArg() .withDescription("max number of threads in thread pool").create(MAX_THREADS_OPTION)); options.addOption(OptionBuilder.withArgName("file").hasArg() .withDescription("file containing access tokens").create(CREDENTIALS_OPTION)); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (cmdline.hasOption(HELP_OPTION) || !cmdline.hasOption(INDEX_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(TrecSearchThriftServer.class.getName(), options); System.exit(-1); } int port = cmdline.hasOption(PORT_OPTION) ? Integer.parseInt(cmdline.getOptionValue(PORT_OPTION)) : DEFAULT_PORT; int maxThreads = cmdline.hasOption(MAX_THREADS_OPTION) ? 
Integer.parseInt(cmdline.getOptionValue(MAX_THREADS_OPTION)) : DEFAULT_MAX_THREADS; File index = new File(cmdline.getOptionValue(INDEX_OPTION)); Map<String, String> credentials = null; if (cmdline.hasOption(CREDENTIALS_OPTION)) { credentials = Maps.newHashMap(); File cfile = new File(cmdline.getOptionValue(CREDENTIALS_OPTION)); if (!cfile.exists()) { System.err.println("Error: " + cfile + " does not exist!"); System.exit(-1); } for (String s : Files.readLines(cfile, Charsets.UTF_8)) { try { String[] arr = s.split(":"); credentials.put(arr[0], arr[1]); } catch (Exception e) { // Catch any exceptions from parsing file contain access tokens System.err.println("Error reading access tokens from " + cfile + "!"); System.exit(-1); } } } if (!index.exists()) { System.err.println("Error: " + index + " does not exist!"); System.exit(-1); } IndexReader reader = DirectoryReader.open(MMapDirectory.open(index)); IndexSearcher searcher = new IndexSearcher(reader); searcher.setSimilarity(new LMDirichletSimilarity(DEFAULT_MU)); QueryLikelihoodModel qlModel = new QueryLikelihoodModel(DEFAULT_MU); TServerSocket serverSocket = new TServerSocket(port); TrecSearch.Processor<TrecSearch.Iface> searchProcessor = new TrecSearch.Processor<TrecSearch.Iface>( new TrecSearchHandler(searcher, qlModel, credentials)); TThreadPoolServer.Args serverArgs = new TThreadPoolServer.Args(serverSocket); serverArgs.maxWorkerThreads(maxThreads); TServer thriftServer = new TThreadPoolServer( serverArgs.processor(searchProcessor).protocolFactory(new TBinaryProtocol.Factory())); thriftServer.serve(); }
From source file:cc.twittertools.search.local.RunQueries.java
License:Apache License
@SuppressWarnings("static-access") public static void main(String[] args) throws Exception { Options options = new Options(); options.addOption(//from w ww . jav a 2 s .c o m OptionBuilder.withArgName("path").hasArg().withDescription("index location").create(INDEX_OPTION)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of results to return") .create(NUM_RESULTS_OPTION)); options.addOption(OptionBuilder.withArgName("file").hasArg() .withDescription("file containing topics in TREC format").create(QUERIES_OPTION)); options.addOption(OptionBuilder.withArgName("similarity").hasArg() .withDescription("similarity to use (BM25, LM)").create(SIMILARITY_OPTION)); options.addOption( OptionBuilder.withArgName("string").hasArg().withDescription("runtag").create(RUNTAG_OPTION)); options.addOption(new Option(VERBOSE_OPTION, "print out complete document")); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (!cmdline.hasOption(QUERIES_OPTION) || !cmdline.hasOption(INDEX_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(RunQueries.class.getName(), options); System.exit(-1); } File indexLocation = new File(cmdline.getOptionValue(INDEX_OPTION)); if (!indexLocation.exists()) { System.err.println("Error: " + indexLocation + " does not exist!"); System.exit(-1); } String runtag = cmdline.hasOption(RUNTAG_OPTION) ? 
cmdline.getOptionValue(RUNTAG_OPTION) : DEFAULT_RUNTAG; String topicsFile = cmdline.getOptionValue(QUERIES_OPTION); int numResults = 1000; try { if (cmdline.hasOption(NUM_RESULTS_OPTION)) { numResults = Integer.parseInt(cmdline.getOptionValue(NUM_RESULTS_OPTION)); } } catch (NumberFormatException e) { System.err.println("Invalid " + NUM_RESULTS_OPTION + ": " + cmdline.getOptionValue(NUM_RESULTS_OPTION)); System.exit(-1); } String similarity = "LM"; if (cmdline.hasOption(SIMILARITY_OPTION)) { similarity = cmdline.getOptionValue(SIMILARITY_OPTION); } boolean verbose = cmdline.hasOption(VERBOSE_OPTION); PrintStream out = new PrintStream(System.out, true, "UTF-8"); IndexReader reader = DirectoryReader.open(FSDirectory.open(indexLocation)); IndexSearcher searcher = new IndexSearcher(reader); if (similarity.equalsIgnoreCase("BM25")) { searcher.setSimilarity(new BM25Similarity()); } else if (similarity.equalsIgnoreCase("LM")) { searcher.setSimilarity(new LMDirichletSimilarity(2500.0f)); } QueryParser p = new QueryParser(Version.LUCENE_43, StatusField.TEXT.name, IndexStatuses.ANALYZER); TrecTopicSet topics = TrecTopicSet.fromFile(new File(topicsFile)); for (TrecTopic topic : topics) { Query query = p.parse(topic.getQuery()); Filter filter = NumericRangeFilter.newLongRange(StatusField.ID.name, 0L, topic.getQueryTweetTime(), true, true); TopDocs rs = searcher.search(query, filter, numResults); int i = 1; for (ScoreDoc scoreDoc : rs.scoreDocs) { Document hit = searcher.doc(scoreDoc.doc); out.println(String.format("%s Q0 %s %d %f %s", topic.getId(), hit.getField(StatusField.ID.name).numericValue(), i, scoreDoc.score, runtag)); if (verbose) { out.println("# " + hit.toString().replaceAll("[\\n\\r]+", " ")); } i++; } } reader.close(); out.close(); }
From source file:cc.twittertools.search.local.SearchStatuses.java
License:Apache License
@SuppressWarnings("static-access") public static void main(String[] args) throws Exception { Options options = new Options(); options.addOption(//w w w. j ava 2 s. co m OptionBuilder.withArgName("path").hasArg().withDescription("index location").create(INDEX_OPTION)); options.addOption( OptionBuilder.withArgName("string").hasArg().withDescription("query id").create(QID_OPTION)); options.addOption( OptionBuilder.withArgName("string").hasArg().withDescription("query text").create(QUERY_OPTION)); options.addOption( OptionBuilder.withArgName("string").hasArg().withDescription("runtag").create(RUNTAG_OPTION)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("maxid").create(MAX_ID_OPTION)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of results to return") .create(NUM_RESULTS_OPTION)); options.addOption(OptionBuilder.withArgName("similarity").hasArg() .withDescription("similarity to use (BM25, LM)").create(SIMILARITY_OPTION)); options.addOption(new Option(VERBOSE_OPTION, "print out complete document")); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (!cmdline.hasOption(QUERY_OPTION) || !cmdline.hasOption(INDEX_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(SearchStatuses.class.getName(), options); System.exit(-1); } File indexLocation = new File(cmdline.getOptionValue(INDEX_OPTION)); if (!indexLocation.exists()) { System.err.println("Error: " + indexLocation + " does not exist!"); System.exit(-1); } String qid = cmdline.hasOption(QID_OPTION) ? cmdline.getOptionValue(QID_OPTION) : DEFAULT_QID; String queryText = cmdline.hasOption(QUERY_OPTION) ? cmdline.getOptionValue(QUERY_OPTION) : DEFAULT_Q; String runtag = cmdline.hasOption(RUNTAG_OPTION) ? 
cmdline.getOptionValue(RUNTAG_OPTION) : DEFAULT_RUNTAG; long maxId = cmdline.hasOption(MAX_ID_OPTION) ? Long.parseLong(cmdline.getOptionValue(MAX_ID_OPTION)) : DEFAULT_MAX_ID; int numResults = cmdline.hasOption(NUM_RESULTS_OPTION) ? Integer.parseInt(cmdline.getOptionValue(NUM_RESULTS_OPTION)) : DEFAULT_NUM_RESULTS; boolean verbose = cmdline.hasOption(VERBOSE_OPTION); String similarity = "LM"; if (cmdline.hasOption(SIMILARITY_OPTION)) { similarity = cmdline.getOptionValue(SIMILARITY_OPTION); } PrintStream out = new PrintStream(System.out, true, "UTF-8"); IndexReader reader = DirectoryReader.open(FSDirectory.open(indexLocation)); IndexSearcher searcher = new IndexSearcher(reader); if (similarity.equalsIgnoreCase("BM25")) { searcher.setSimilarity(new BM25Similarity()); } else if (similarity.equalsIgnoreCase("LM")) { searcher.setSimilarity(new LMDirichletSimilarity(2500.0f)); } QueryParser p = new QueryParser(Version.LUCENE_43, IndexStatuses.StatusField.TEXT.name, IndexStatuses.ANALYZER); Query query = p.parse(queryText); Filter filter = NumericRangeFilter.newLongRange(StatusField.ID.name, 0L, maxId, true, true); TopDocs rs = searcher.search(query, filter, numResults); int i = 1; for (ScoreDoc scoreDoc : rs.scoreDocs) { Document hit = searcher.doc(scoreDoc.doc); out.println(String.format("%s Q0 %s %d %f %s", qid, hit.getField(StatusField.ID.name).numericValue(), i, scoreDoc.score, runtag)); if (verbose) { out.println("# " + hit.toString().replaceAll("[\\n\\r]+", " ")); } i++; } reader.close(); out.close(); }
From source file:cc.wikitools.lucene.WikipediaSearcher.java
License:Apache License
/**
 * Sets up the searcher over {@code reader} and the two query parsers (article text
 * and title), both using {@code IndexWikipediaDump.ANALYZER}.
 */
protected void init() {
  // Score with a Dirichlet-smoothed language model (mu = 2500).
  IndexSearcher indexSearcher = new IndexSearcher(reader);
  indexSearcher.setSimilarity(new LMDirichletSimilarity(2500.0f));
  searcher = indexSearcher;

  parserArticle = new QueryParser(Version.LUCENE_43, IndexField.TEXT.name, IndexWikipediaDump.ANALYZER);
  parserTitle = new QueryParser(Version.LUCENE_43, IndexField.TITLE.name, IndexWikipediaDump.ANALYZER);
}
From source file:io.anserini.search.SearchCollection.java
License:Apache License
/**
 * Opens the index named by {@code args.index} and configures scoring, analysis and
 * the reranking cascade from the given arguments.
 *
 * <p>All argument validation happens before the index is opened, so a rejected
 * configuration does not leave an open reader behind.
 *
 * @param args search configuration
 * @throws IOException if the index cannot be opened
 * @throws IllegalArgumentException if the index path is not an existing, readable
 *     directory, or if none of the ql / bm25 / f2log scoring models is selected
 */
public SearchCollection(SearchArgs args) throws IOException {
  this.args = args;
  Path indexPath = Paths.get(args.index);

  if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) {
    throw new IllegalArgumentException(args.index + " does not exist or is not a directory.");
  }

  // Fail fast: reject a missing scoring model before acquiring the reader.
  if (!args.ql && !args.bm25 && !args.f2log) {
    throw new IllegalArgumentException("Error: Must specify scoring model!");
  }

  LOG.info("Reading index at " + args.index);
  this.reader = DirectoryReader.open(FSDirectory.open(indexPath));

  // Figure out which scoring model to use.
  if (args.ql) {
    LOG.info("Using QL scoring model");
    this.similarity = new LMDirichletSimilarity(args.mu);
  } else if (args.bm25) {
    LOG.info("Using BM25 scoring model");
    this.similarity = new BM25Similarity(args.k1, args.b);
  } else {
    // Only f2log can remain here; the guard above excluded "none selected".
    LOG.info("Using F2Log scoring model");
    this.similarity = new F2LogSimilarity(args.f2log_s);
  }

  // Are we searching tweets?
  if (args.searchtweets) {
    analyzer = new TweetAnalyzer();
  } else {
    analyzer = args.keepstop ? new EnglishAnalyzer(CharArraySet.EMPTY_SET) : new EnglishAnalyzer();
  }

  isRerank = args.rm3 || args.axiom;

  // Set up the ranking cascade.
  cascade = new RerankerCascade();
  if (args.rm3) {
    cascade.add(new Rm3Reranker(analyzer, FIELD_BODY, args));
  } else if (args.axiom) {
    cascade.add(new AxiomReranker(FIELD_BODY, args));
  }

  cascade.add(new ScoreTiesAdjusterReranker());
}
From source file:io.anserini.search.SearchTweets.java
License:Apache License
public static void main(String[] args) throws Exception { long curTime = System.nanoTime(); SearchArgs searchArgs = new SearchArgs(); CmdLineParser parser = new CmdLineParser(searchArgs, ParserProperties.defaults().withUsageWidth(90)); try {//from w w w . j a v a 2 s .c om parser.parseArgument(args); } catch (CmdLineException e) { System.err.println(e.getMessage()); parser.printUsage(System.err); System.err.println("Example: SearchTweets" + parser.printExample(OptionHandlerFilter.REQUIRED)); return; } LOG.info("Reading index at " + searchArgs.index); Directory dir; if (searchArgs.inmem) { LOG.info("Using MMapDirectory with preload"); dir = new MMapDirectory(Paths.get(searchArgs.index)); ((MMapDirectory) dir).setPreload(true); } else { LOG.info("Using default FSDirectory"); dir = FSDirectory.open(Paths.get(searchArgs.index)); } IndexReader reader = DirectoryReader.open(dir); IndexSearcher searcher = new IndexSearcher(reader); if (searchArgs.ql) { LOG.info("Using QL scoring model"); searcher.setSimilarity(new LMDirichletSimilarity(searchArgs.mu)); } else if (searchArgs.bm25) { LOG.info("Using BM25 scoring model"); searcher.setSimilarity(new BM25Similarity(searchArgs.k1, searchArgs.b)); } else { LOG.error("Error: Must specify scoring model!"); System.exit(-1); } RerankerCascade cascade = new RerankerCascade(); if (searchArgs.rm3) { cascade.add(new Rm3Reranker(IndexTweets.ANALYZER, StatusField.TEXT.name, "src/main/resources/io/anserini/rerank/rm3/rm3-stoplist.twitter.txt")); cascade.add(new RemoveRetweetsTemporalTiebreakReranker()); } else { cascade.add(new RemoveRetweetsTemporalTiebreakReranker()); } MicroblogTopicSet topics = MicroblogTopicSet.fromFile(new File(searchArgs.topics)); PrintStream out = new PrintStream(new FileOutputStream(new File(searchArgs.output))); LOG.info("Writing output to " + searchArgs.output); LOG.info("Initialized complete! 
(elapsed time = " + (System.nanoTime() - curTime) / 1000000 + "ms)"); long totalTime = 0; int cnt = 0; for (MicroblogTopic topic : topics) { long curQueryTime = System.nanoTime(); Filter filter = NumericRangeFilter.newLongRange(StatusField.ID.name, 0L, topic.getQueryTweetTime(), true, true); Query query = AnalyzerUtils.buildBagOfWordsQuery(StatusField.TEXT.name, IndexTweets.ANALYZER, topic.getQuery()); TopDocs rs = searcher.search(query, filter, searchArgs.hits); RerankerContext context = new RerankerContext(searcher, query, topic.getId(), topic.getQuery(), Sets.newHashSet(AnalyzerUtils.tokenize(IndexTweets.ANALYZER, topic.getQuery())), filter); ScoredDocuments docs = cascade.run(ScoredDocuments.fromTopDocs(rs, searcher), context); for (int i = 0; i < docs.documents.length; i++) { String qid = topic.getId().replaceFirst("^MB0*", ""); out.println(String.format("%s Q0 %s %d %f %s", qid, docs.documents[i].getField(StatusField.ID.name).numericValue(), (i + 1), docs.scores[i], searchArgs.runtag)); } long qtime = (System.nanoTime() - curQueryTime) / 1000000; LOG.info("Query " + topic.getId() + " (elapsed time = " + qtime + "ms)"); totalTime += qtime; cnt++; } LOG.info("All queries completed!"); LOG.info("Total elapsed time = " + totalTime + "ms"); LOG.info("Average query latency = " + (totalTime / cnt) + "ms"); reader.close(); out.close(); }
From source file:io.anserini.search.SearchWebCollection.java
License:Apache License
public static void main(String[] args) throws IOException, ParseException { long curTime = System.nanoTime(); SearchArgs searchArgs = new SearchArgs(); CmdLineParser parser = new CmdLineParser(searchArgs, ParserProperties.defaults().withUsageWidth(90)); try {//ww w .java 2 s. com parser.parseArgument(args); } catch (CmdLineException e) { System.err.println(e.getMessage()); parser.printUsage(System.err); System.err.println("Example: SearchWebCollection" + parser.printExample(OptionHandlerFilter.REQUIRED)); return; } LOG.info("Reading index at " + searchArgs.index); Directory dir; if (searchArgs.inmem) { LOG.info("Using MMapDirectory with preload"); dir = new MMapDirectory(Paths.get(searchArgs.index)); ((MMapDirectory) dir).setPreload(true); } else { LOG.info("Using default FSDirectory"); dir = FSDirectory.open(Paths.get(searchArgs.index)); } Similarity similarity = null; if (searchArgs.ql) { LOG.info("Using QL scoring model"); similarity = new LMDirichletSimilarity(searchArgs.mu); } else if (searchArgs.bm25) { LOG.info("Using BM25 scoring model"); similarity = new BM25Similarity(searchArgs.k1, searchArgs.b); } else { LOG.error("Error: Must specify scoring model!"); System.exit(-1); } RerankerCascade cascade = new RerankerCascade(); if (searchArgs.rm3) { cascade.add(new Rm3Reranker(new EnglishAnalyzer(), "body", "src/main/resources/io/anserini/rerank/rm3/rm3-stoplist.gov2.txt")); } else { cascade.add(new IdentityReranker()); } Path topicsFile = Paths.get(searchArgs.topics); if (!Files.exists(topicsFile) || !Files.isRegularFile(topicsFile) || !Files.isReadable(topicsFile)) { throw new IllegalArgumentException( "Topics file : " + topicsFile + " does not exist or is not a (readable) file."); } SortedMap<Integer, String> topics = io.anserini.document.Collection.GOV2.equals(searchArgs.collection) ? 
readTeraByteTackQueries(topicsFile) : readWebTrackQueries(topicsFile); SearchWebCollection searcher = new SearchWebCollection(searchArgs.index); searcher.search(topics, searchArgs.output, similarity, searchArgs.hits); searcher.close(); }
From source file:io.anserini.search.SimpleSearcher.java
License:Apache License
public SimpleSearcher(String indexDir) throws IOException { Path indexPath = Paths.get(indexDir); if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) { throw new IllegalArgumentException(indexDir + " does not exist or is not a directory."); }//from w w w . j a v a2 s. c o m this.reader = DirectoryReader.open(FSDirectory.open(indexPath)); this.similarity = new LMDirichletSimilarity(1000.0f); this.analyzer = new EnglishAnalyzer(); }
From source file:ir.ac.ut.engine.FeaturedRetriever.java
public static ScoreDoc[] search(String query, String qId, String field) throws IOException { float mu = (float) 1000; query = query.toLowerCase();//from ww w .j av a2 s. com BooleanQuery.setMaxClauseCount(query.length()); Analyzer analyzer; if (field.equals(IndexedDocument.FIELD_REAL_ID)) { analyzer = new SimpleAnalyzer(Version.LUCENE_CURRENT); } else if (field.equals(IndexedDocument.FIELD_NAMED_ENTITIES)) { analyzer = (new MyAnalyzer(false)).MyNgramAnalyzer(); } else if (field.equals(IndexedDocument.FIELD_SORTED_BIGRAMS)) { analyzer = (new MyAnalyzer(false)).MyNgramAnalyzer(); } else if (field.equals(IndexedDocument.FIELD_SORTED_TRIGRAMS)) { analyzer = (new MyAnalyzer(false)).MyNgramAnalyzer(); } else if (field.equals(IndexedDocument.FIELD_STOPWORDS3Gram)) { analyzer = (new MyAnalyzer(false)).MyNgramAnalyzer(); } else if (field.equals(IndexedDocument.FIELD_POS3GRAM)) { analyzer = (new MyAnalyzer(false)).MyNgramAnalyzer(); } else { analyzer = (new MyAnalyzer(false)).MyDefaultAnalyzer(); } QueryParser qParser = new QueryParser(Version.LUCENE_47, field, analyzer); Query q = null; try { q = qParser.parse(QueryParser.escape(query)); } catch (org.apache.lucene.queryparser.classic.ParseException e) { e.printStackTrace(); System.out.println("Exceptional Query:" + qId); return new ScoreDoc[0]; } Similarity simFunction = new LMDirichletSimilarity(mu); // Similarity simFunction = new BM25Similarity(); IndexSearcher isearcher = new IndexSearcher(ireader); isearcher.setSimilarity(simFunction); TopFieldCollector tfc = TopFieldCollector.create(Sort.RELEVANCE, ireader.numDocs(), true, true, true, false); isearcher.search(q, tfc); TopDocs results = tfc.topDocs(); ScoreDoc[] hits = results.scoreDocs; reportInTREC(hits, qId); return hits; }