List of usage examples for org.apache.lucene.search.similarities.ClassicSimilarity

Constructor: public ClassicSimilarity()
From source file:IrqaQuery.java
License:Apache License
public static void makeIndexWriter(String indexPath, String stopPath, String sim) throws IOException {
    System.out.println("[makeIndexWriter] started");
    System.out.println("[makeIndexWriter]" + stopPath);

    Directory dir = FSDirectory.open(Paths.get(indexPath));
    Analyzer analyzer = new EnglishAnalyzer(StopFilter.makeStopSet(mygetStopwords(stopPath)));
    IndexWriterConfig iwc = new IndexWriterConfig(analyzer);

    // Select the scoring model; any value other than "TFIDF" falls back to BM25.
    if (sim.equals("TFIDF"))
        iwc.setSimilarity(new ClassicSimilarity());
    else
        iwc.setSimilarity(new BM25Similarity());

    writer = new IndexWriter(dir, iwc);
}
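A minimal invocation sketch under the assumptions visible above (the paths are illustrative, and the static writer field must later be closed by the caller):

// Hypothetical call: builds an index scored with ClassicSimilarity (TF-IDF);
// any other value for the third argument selects BM25.
IrqaQuery.makeIndexWriter("/data/index", "/data/stopwords.txt", "TFIDF");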
From source file:IrqaQuery.java
License:Apache License
public static List<Document> query(String index, String stoppath, String question, int numResult, String sim)
        throws Exception {
    IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(index)));
    IndexSearcher searcher = new IndexSearcher(reader);
    Analyzer analyzer = new EnglishAnalyzer(StopFilter.makeStopSet(mygetStopwords(stoppath)));

    // The search-time similarity should match the one used at index time.
    if (sim.equals("TFIDF"))
        searcher.setSimilarity(new ClassicSimilarity());
    else
        searcher.setSimilarity(new BM25Similarity());

    String field = "contents";
    QueryParser parser = new QueryParser(field, analyzer);
    Query query = parser.parse(parser.escape(question));

    TopDocs results = searcher.search(query, numResult);
    ScoreDoc[] hits = results.scoreDocs;
    int numTotalHits = results.totalHits;
    int end = Math.min(numTotalHits, numResult);

    List<Document> docs = new ArrayList<Document>();
    for (int i = 0; i < end; i++) {
        Document doc = searcher.doc(hits[i].doc);
        docs.add(doc);
    }
    return docs;
}
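Ranking is only meaningful when the search-time similarity matches the one used at index time; a hypothetical round trip under that assumption (paths and query text are illustrative):

// Index and query with the same "TFIDF" flag so both sides use ClassicSimilarity.
IrqaQuery.makeIndexWriter("/data/index", "/data/stopwords.txt", "TFIDF");
List<Document> top10 = IrqaQuery.query("/data/index", "/data/stopwords.txt", "what is a similarity", 10, "TFIDF");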
From source file:KNearestNeighborClassifier.java
License:Apache License
/**
 * Creates a {@link KNearestNeighborClassifier}.
 *
 * @param leafReader     the reader on the index to be used for classification
 * @param similarity     the {@link Similarity} to be used by the underlying {@link IndexSearcher}, or {@code null}
 *                       (defaults to {@link org.apache.lucene.search.similarities.ClassicSimilarity})
 * @param analyzer       an {@link Analyzer} used to analyze unseen text
 * @param query          a {@link Query} to eventually filter the docs used for training the classifier, or {@code null}
 *                       if all the indexed docs should be used
 * @param k              the number of docs to select in the MLT results to find the nearest neighbor
 * @param minDocsFreq    the {@link MoreLikeThis#minDocFreq} parameter
 * @param minTermFreq    the {@link MoreLikeThis#minTermFreq} parameter
 * @param classFieldName the name of the field used as the output for the classifier
 * @param textFieldNames the names of the fields used as inputs for the classifier; they can contain a boosting
 *                       indication, e.g. title^10
 */
public KNearestNeighborClassifier(IndexReader leafReader, Similarity similarity, Analyzer analyzer, Query query,
        int k, int minDocsFreq, int minTermFreq, String classFieldName, String... textFieldNames) {
    this.textFieldNames = textFieldNames;
    this.classFieldName = classFieldName;

    this.mlt = new MoreLikeThis(leafReader);
    this.mlt.setAnalyzer(analyzer);
    this.mlt.setFieldNames(textFieldNames);

    this.indexSearcher = new IndexSearcher(leafReader);
    if (similarity != null) {
        this.indexSearcher.setSimilarity(similarity);
    } else {
        // Fall back to TF-IDF scoring when no similarity is supplied.
        this.indexSearcher.setSimilarity(new ClassicSimilarity());
    }
    if (minDocsFreq > 0) {
        mlt.setMinDocFreq(minDocsFreq);
    }
    if (minTermFreq > 0) {
        mlt.setMinTermFreq(minTermFreq);
    }
    this.query = query;
    this.k = k;
}
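A minimal construction sketch for this classifier (the index path, analyzer, field names, and k are illustrative assumptions; passing null as the similarity would trigger the same ClassicSimilarity fallback shown in the constructor):

IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/data/index")));
// k = 3 neighbors, no training filter query, MoreLikeThis frequency cutoffs disabled (0).
KNearestNeighborClassifier knn = new KNearestNeighborClassifier(reader, new ClassicSimilarity(),
        new StandardAnalyzer(), null, 3, 0, 0, "category", "body");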
From source file:luceneInterface.java
License:Apache License
public static List<Document> query(String index, String stoppath, String question, int numResult, String sim)
        throws Exception {
    IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(index)));
    IndexSearcher searcher = new IndexSearcher(reader);
    Analyzer analyzer = new EnglishAnalyzer(StopFilter.makeStopSet(mygetStopwords(stoppath)));

    if (sim.equals("TFIDF"))
        searcher.setSimilarity(new ClassicSimilarity());
    else
        searcher.setSimilarity(new BM25Similarity());

    String field = "contents";
    QueryParser parser = new QueryParser(field, analyzer);

    // Search both the "contents" and "sec" fields; either may match (SHOULD).
    BooleanQuery.Builder bqb = new BooleanQuery.Builder();
    bqb.add(new TermQuery(new Term("contents", parser.escape(question))), BooleanClause.Occur.SHOULD);
    bqb.add(new TermQuery(new Term("sec", parser.escape(question))), BooleanClause.Occur.SHOULD);

    // Re-parse the boolean query's string form so the terms go through the analyzer.
    TopDocs results = searcher.search(parser.parse(bqb.build().toString()), numResult);
    ScoreDoc[] hits = results.scoreDocs;
    int numTotalHits = results.totalHits;
    int end = Math.min(numTotalHits, numResult);

    List<Document> docs = new ArrayList<Document>();
    for (int i = 0; i < end; i++) {
        Document doc = searcher.doc(hits[i].doc);
        docs.add(doc);
    }
    return docs;
}
From source file:ai.castor.idf.FetchTermIDF.java
License:Apache License
public double getTermIDF(String term) throws ParseException {
    Analyzer analyzer = new EnglishAnalyzer(CharArraySet.EMPTY_SET);
    QueryParser qp = new QueryParser(FIELD_BODY, analyzer);
    ClassicSimilarity similarity = new ClassicSimilarity();

    String esTerm = qp.escape(term);
    double termIDF = 0.0;
    try {
        // Parse the escaped term to obtain its analyzed form, then look up its IDF.
        TermQuery q = (TermQuery) qp.parse(esTerm);
        Term t = q.getTerm();
        termIDF = similarity.idf(reader.docFreq(t), reader.numDocs());
        System.out.println(term + '\t' + esTerm + '\t' + q + '\t' + t + '\t' + termIDF);
    } catch (Exception e) {
        System.err.println("Exception in fetching IDF(" + term + "): " + e.toString());
    }
    return termIDF;
}
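ClassicSimilarity computes IDF as 1 + ln(numDocs / (docFreq + 1)), so the value returned here can be checked by hand; a small sanity-check sketch with made-up statistics:

ClassicSimilarity similarity = new ClassicSimilarity();
float idf = similarity.idf(9, 1000); // docFreq = 9, numDocs = 1000
// expected: 1 + ln(1000 / (9 + 1)) = 1 + ln(100) ≈ 5.605
assert Math.abs(idf - (1.0 + Math.log(100.0))) < 1e-3;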
From source file:ai.castor.idf.IDFScorer.java
License:Apache License
public double calcIDF(String query, String answer, boolean analyze) throws ParseException {
    Analyzer analyzer;
    if (analyze) {
        analyzer = new EnglishAnalyzer(StopFilter.makeStopSet(stopWords));
    } else {
        analyzer = new WhitespaceAnalyzer();
    }
    QueryParser qp = new QueryParser(FIELD_BODY, analyzer);
    ClassicSimilarity similarity = new ClassicSimilarity();

    String escapedQuery = qp.escape(query);
    Query question = qp.parse(escapedQuery);
    HashSet<String> questionTerms = new HashSet<>(Arrays.asList(question.toString().trim().split("\\s+")));

    // Sum the IDF of each distinct answer term that also occurs in the question.
    double idf = 0.0;
    HashSet<String> seenTerms = new HashSet<>();
    String[] terms = answer.split("\\s+");
    for (String term : terms) {
        try {
            TermQuery q = (TermQuery) qp.parse(term);
            Term t = q.getTerm();
            if (questionTerms.contains(t.toString()) && !seenTerms.contains(t.toString())) {
                idf += similarity.idf(reader.docFreq(t), reader.numDocs());
                seenTerms.add(t.toString());
            }
        } catch (Exception e) {
            continue; // skip terms that fail to parse
        }
    }
    return idf;
}
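The method is effectively a question/answer term-overlap score weighted by classic IDF; a hypothetical call, assuming an already-constructed IDFScorer instance (its constructor is not shown above):

// Sums the IDF of each distinct answer term that also appears in the question.
double overlap = scorer.calcIDF("who wrote hamlet", "Shakespeare wrote Hamlet", true);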
From source file:com.github.tteofili.looseen.Test20NewsgroupsClassification.java
License:Apache License
@Test
public void test20Newsgroups() throws Exception {
    String indexProperty = System.getProperty("index");
    if (indexProperty != null) {
        try {
            index = Boolean.valueOf(indexProperty);
        } catch (Exception e) {
            // ignore
        }
    }
    String splitProperty = System.getProperty("split");
    if (splitProperty != null) {
        try {
            split = Boolean.valueOf(splitProperty);
        } catch (Exception e) {
            // ignore
        }
    }

    Path mainIndexPath = Paths.get(INDEX + "/original");
    Directory directory = FSDirectory.open(mainIndexPath);
    Path trainPath = Paths.get(INDEX + "/train");
    Path testPath = Paths.get(INDEX + "/test");
    Path cvPath = Paths.get(INDEX + "/cv");
    FSDirectory cv = null;
    FSDirectory test = null;
    FSDirectory train = null;
    IndexReader testReader = null;
    if (split) {
        cv = FSDirectory.open(cvPath);
        test = FSDirectory.open(testPath);
        train = FSDirectory.open(trainPath);
    }
    if (index) {
        delete(mainIndexPath);
        if (split) {
            delete(trainPath, testPath, cvPath);
        }
    }

    IndexReader reader = null;
    List<Classifier<BytesRef>> classifiers = new LinkedList<>();
    try {
        Analyzer analyzer = new StandardAnalyzer();
        if (index) {
            System.out.format("Indexing 20 Newsgroups...%n");
            long startIndex = System.currentTimeMillis();
            IndexWriter indexWriter = new IndexWriter(directory, new IndexWriterConfig(analyzer));
            buildIndex(new File(PREFIX + "/20n/20_newsgroups"), indexWriter);
            long endIndex = System.currentTimeMillis();
            System.out.format("Indexed %d pages in %ds %n", indexWriter.maxDoc(), (endIndex - startIndex) / 1000);
            indexWriter.close();
        }

        if (split && !index) {
            reader = DirectoryReader.open(train);
        } else {
            reader = DirectoryReader.open(directory);
        }

        if (index && split) {
            // split the index
            System.out.format("Splitting the index...%n");
            long startSplit = System.currentTimeMillis();
            DatasetSplitter datasetSplitter = new DatasetSplitter(0.1, 0);
            datasetSplitter.split(reader, train, test, cv, analyzer, false, CATEGORY_FIELD, BODY_FIELD,
                    SUBJECT_FIELD, CATEGORY_FIELD);
            reader.close();
            reader = DirectoryReader.open(train); // using the train index from now on
            long endSplit = System.currentTimeMillis();
            System.out.format("Splitting done in %ds %n", (endSplit - startSplit) / 1000);
        }

        final long startTime = System.currentTimeMillis();

        classifiers.add(new KNearestNeighborClassifier(reader, new ClassicSimilarity(), analyzer, null, 1, 0, 0, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, null, analyzer, null, 1, 0, 0, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new ClassicSimilarity(), analyzer, null, 3, 0, 0, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new AxiomaticF1EXP(), analyzer, null, 3, 0, 0, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new AxiomaticF1LOG(), analyzer, null, 3, 0, 0, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new LMDirichletSimilarity(), analyzer, null, 3, 1, 1, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new LMJelinekMercerSimilarity(0.3f), analyzer, null, 3, 1, 1, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, null, analyzer, null, 3, 1, 1, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new DFRSimilarity(new BasicModelG(), new AfterEffectB(), new NormalizationH1()), analyzer, null, 3, 1, 1, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new DFRSimilarity(new BasicModelP(), new AfterEffectL(), new NormalizationH3()), analyzer, null, 3, 1, 1, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new IBSimilarity(new DistributionSPL(), new LambdaDF(), new Normalization.NoNormalization()), analyzer, null, 3, 1, 1, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new IBSimilarity(new DistributionLL(), new LambdaTTF(), new NormalizationH1()), analyzer, null, 3, 1, 1, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new MinHashClassifier(reader, BODY_FIELD, CATEGORY_FIELD, 15, 1, 100));
        classifiers.add(new MinHashClassifier(reader, BODY_FIELD, CATEGORY_FIELD, 30, 3, 300));
        classifiers.add(new MinHashClassifier(reader, BODY_FIELD, CATEGORY_FIELD, 10, 1, 100));
        classifiers.add(new KNearestFuzzyClassifier(reader, new LMJelinekMercerSimilarity(0.3f), analyzer, null, 1, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new IBSimilarity(new DistributionLL(), new LambdaTTF(), new NormalizationH1()), analyzer, null, 1, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new ClassicSimilarity(), analyzer, null, 1, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new ClassicSimilarity(), analyzer, null, 3, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, null, analyzer, null, 1, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, null, analyzer, null, 3, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new AxiomaticF1EXP(), analyzer, null, 3, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new AxiomaticF1LOG(), analyzer, null, 3, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new BM25NBClassifier(reader, analyzer, null, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new CachingNaiveBayesClassifier(reader, analyzer, null, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new SimpleNaiveBayesClassifier(reader, analyzer, null, CATEGORY_FIELD, BODY_FIELD));

        int maxdoc;
        if (split) {
            testReader = DirectoryReader.open(test);
            maxdoc = testReader.maxDoc();
        } else {
            maxdoc = reader.maxDoc();
        }
        System.out.format("Starting evaluation on %d docs...%n", maxdoc);

        ExecutorService service = Executors.newCachedThreadPool();
        List<Future<String>> futures = new LinkedList<>();
        for (Classifier<BytesRef> classifier : classifiers) {
            testClassifier(reader, startTime, testReader, service, futures, classifier);
        }
        for (Future<String> f : futures) {
            System.out.println(f.get());
        }

        Thread.sleep(10000);
        service.shutdown();
    } finally {
        if (reader != null) {
            reader.close();
        }
        directory.close();
        if (test != null) {
            test.close();
        }
        if (train != null) {
            train.close();
        }
        if (cv != null) {
            cv.close();
        }
        if (testReader != null) {
            testReader.close();
        }
        for (Classifier c : classifiers) {
            if (c instanceof Closeable) {
                ((Closeable) c).close();
            }
        }
    }
}
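Note that, per the KNearestNeighborClassifier constructor shown earlier, entries built with a null similarity fall back to ClassicSimilarity; for example, the first two k=1 lines in the list above configure identical classifiers:

// These two classifiers score identically: null defaults to ClassicSimilarity.
classifiers.add(new KNearestNeighborClassifier(reader, new ClassicSimilarity(), analyzer, null, 1, 0, 0, CATEGORY_FIELD, BODY_FIELD));
classifiers.add(new KNearestNeighborClassifier(reader, null, analyzer, null, 1, 0, 0, CATEGORY_FIELD, BODY_FIELD));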
From source file:com.github.tteofili.looseen.TestWikipediaClassification.java
License:Apache License
@Test
public void testItalianWikipedia() throws Exception {
    String indexProperty = System.getProperty("index");
    if (indexProperty != null) {
        try {
            index = Boolean.valueOf(indexProperty);
        } catch (Exception e) {
            // ignore
        }
    }
    String splitProperty = System.getProperty("split");
    if (splitProperty != null) {
        try {
            split = Boolean.valueOf(splitProperty);
        } catch (Exception e) {
            // ignore
        }
    }

    Path mainIndexPath = Paths.get(INDEX + "/original");
    Directory directory = FSDirectory.open(mainIndexPath);
    Path trainPath = Paths.get(INDEX + "/train");
    Path testPath = Paths.get(INDEX + "/test");
    Path cvPath = Paths.get(INDEX + "/cv");
    FSDirectory cv = null;
    FSDirectory test = null;
    FSDirectory train = null;
    DirectoryReader testReader = null;
    if (split) {
        cv = FSDirectory.open(cvPath);
        test = FSDirectory.open(testPath);
        train = FSDirectory.open(trainPath);
    }
    if (index) {
        delete(mainIndexPath);
        if (split) {
            delete(trainPath, testPath, cvPath);
        }
    }

    IndexReader reader = null;
    try {
        Collection<String> stopWordsList = Arrays.asList("di", "a", "da", "in", "per", "tra", "fra", "il", "lo",
                "la", "i", "gli", "le");
        CharArraySet stopWords = new CharArraySet(stopWordsList, true);
        Analyzer analyzer = new ItalianAnalyzer(stopWords);

        if (index) {
            System.out.format("Indexing Italian Wikipedia...%n");
            long startIndex = System.currentTimeMillis();
            IndexWriter indexWriter = new IndexWriter(directory, new IndexWriterConfig(analyzer));
            importWikipedia(new File(PREFIX + "/itwiki/itwiki-20150405-pages-meta-current1.xml"), indexWriter);
            importWikipedia(new File(PREFIX + "/itwiki/itwiki-20150405-pages-meta-current2.xml"), indexWriter);
            importWikipedia(new File(PREFIX + "/itwiki/itwiki-20150405-pages-meta-current3.xml"), indexWriter);
            importWikipedia(new File(PREFIX + "/itwiki/itwiki-20150405-pages-meta-current4.xml"), indexWriter);
            long endIndex = System.currentTimeMillis();
            System.out.format("Indexed %d pages in %ds %n", indexWriter.maxDoc(), (endIndex - startIndex) / 1000);
            indexWriter.close();
        }

        if (split && !index) {
            reader = DirectoryReader.open(train);
        } else {
            reader = DirectoryReader.open(directory);
        }

        if (index && split) {
            // split the index
            System.out.format("Splitting the index...%n");
            long startSplit = System.currentTimeMillis();
            DatasetSplitter datasetSplitter = new DatasetSplitter(0.1, 0);
            for (LeafReaderContext context : reader.leaves()) {
                datasetSplitter.split(context.reader(), train, test, cv, analyzer, false, CATEGORY_FIELD,
                        TEXT_FIELD, CATEGORY_FIELD);
            }
            reader.close();
            reader = DirectoryReader.open(train); // using the train index from now on
            long endSplit = System.currentTimeMillis();
            System.out.format("Splitting done in %ds %n", (endSplit - startSplit) / 1000);
        }

        final long startTime = System.currentTimeMillis();

        List<Classifier<BytesRef>> classifiers = new LinkedList<>();
        classifiers.add(new KNearestNeighborClassifier(reader, new ClassicSimilarity(), analyzer, null, 1, 0, 0, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new BM25Similarity(), analyzer, null, 1, 0, 0, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, null, analyzer, null, 1, 0, 0, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new LMDirichletSimilarity(), analyzer, null, 3, 1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new LMJelinekMercerSimilarity(0.3f), analyzer, null, 3, 1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new ClassicSimilarity(), analyzer, null, 3, 0, 0, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new ClassicSimilarity(), analyzer, null, 3, 1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new DFRSimilarity(new BasicModelG(), new AfterEffectB(), new NormalizationH1()), analyzer, null, 3, 1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new DFRSimilarity(new BasicModelP(), new AfterEffectL(), new NormalizationH3()), analyzer, null, 3, 1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new IBSimilarity(new DistributionSPL(), new LambdaDF(), new Normalization.NoNormalization()), analyzer, null, 3, 1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new IBSimilarity(new DistributionLL(), new LambdaTTF(), new NormalizationH1()), analyzer, null, 3, 1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 5, 1, 100));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 10, 1, 100));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 15, 1, 100));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 15, 3, 100));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 15, 3, 300));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 5, 3, 100));
        classifiers.add(new KNearestFuzzyClassifier(reader, new ClassicSimilarity(), analyzer, null, 3, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new ClassicSimilarity(), analyzer, null, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new BM25Similarity(), analyzer, null, 3, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new BM25Similarity(), analyzer, null, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new BM25NBClassifier(reader, analyzer, null, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new CachingNaiveBayesClassifier(reader, analyzer, null, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new SimpleNaiveBayesClassifier(reader, analyzer, null, CATEGORY_FIELD, TEXT_FIELD));

        int maxdoc;
        if (split) {
            testReader = DirectoryReader.open(test);
            maxdoc = testReader.maxDoc();
        } else {
            maxdoc = reader.maxDoc();
        }
        System.out.format("Starting evaluation on %d docs...%n", maxdoc);

        ExecutorService service = Executors.newCachedThreadPool();
        List<Future<String>> futures = new LinkedList<>();
        for (Classifier<BytesRef> classifier : classifiers) {
            final IndexReader finalReader = reader;
            final DirectoryReader finalTestReader = testReader;
            futures.add(service.submit(() -> {
                ConfusionMatrixGenerator.ConfusionMatrix confusionMatrix;
                if (split) {
                    confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(finalTestReader, classifier,
                            CATEGORY_FIELD, TEXT_FIELD, 60000 * 30);
                } else {
                    confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(finalReader, classifier,
                            CATEGORY_FIELD, TEXT_FIELD, 60000 * 30);
                }
                final long endTime = System.currentTimeMillis();
                final int elapse = (int) (endTime - startTime) / 1000;
                return " * " + classifier + " \n * accuracy = " + confusionMatrix.getAccuracy()
                        + "\n * precision = " + confusionMatrix.getPrecision() + "\n * recall = "
                        + confusionMatrix.getRecall() + "\n * f1-measure = " + confusionMatrix.getF1Measure()
                        + "\n * avgClassificationTime = " + confusionMatrix.getAvgClassificationTime()
                        + "\n * time = " + elapse + " (sec)\n ";
            }));
        }
        for (Future<String> f : futures) {
            System.out.println(f.get());
        }

        Thread.sleep(10000);
        service.shutdown();
    } finally {
        try {
            if (reader != null) {
                reader.close();
            }
            if (directory != null) {
                directory.close();
            }
            if (test != null) {
                test.close();
            }
            if (train != null) {
                train.close();
            }
            if (cv != null) {
                cv.close();
            }
            if (testReader != null) {
                testReader.close();
            }
        } catch (Throwable e) {
            e.printStackTrace();
        }
    }
}
From source file:com.mycompany.lucenedemo.SearchFiles.java
/** Simple command-line based search demo. */
public static void main(String[] args) throws Exception {
    String usage = "Usage:\tjava org.apache.lucene.demo.SearchFiles [-index dir] [-field f] [-repeat n]"
            + " [-queries file] [-query string] [-raw] [-paging hitsPerPage] [-sim vsm or bm25]\n\n"
            + "See http://lucene.apache.org/core/4_1_0/demo/ for details.";
    if (args.length == 0 || "-h".equals(args[0]) || "-help".equals(args[0])) {
        System.out.println(usage);
        System.exit(0);
    }

    String index = "index";
    String field = "contents";
    String queries = null;
    int repeat = 0;
    boolean raw = false;
    String queryString = null;
    int hitsPerPage = 10;
    SimilarityScore score = SimilarityScore.DEFAULT;

    for (int i = 0; i < args.length; i++) {
        if ("-index".equals(args[i])) {
            index = args[i + 1];
            i++;
        } else if ("-field".equals(args[i])) {
            field = args[i + 1];
            i++;
        } else if ("-queries".equals(args[i])) {
            queries = args[i + 1];
            i++;
        } else if ("-query".equals(args[i])) {
            queryString = args[i + 1];
            i++;
        } else if ("-repeat".equals(args[i])) {
            repeat = Integer.parseInt(args[i + 1]);
            i++;
        } else if ("-raw".equals(args[i])) {
            raw = true;
        } else if ("-paging".equals(args[i])) {
            hitsPerPage = Integer.parseInt(args[i + 1]);
            if (hitsPerPage <= 0) {
                System.err.println("There must be at least 1 hit per page.");
                System.exit(1);
            }
            i++;
        } else if ("-sim".equals(args[i])) {
            if (args[i + 1].equals("vsm")) {
                score = SimilarityScore.VSM;
            } else {
                score = SimilarityScore.BM25;
            }
        }
    }

    IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(index)));
    IndexSearcher searcher = new IndexSearcher(reader);
    Analyzer analyzer = new StandardAnalyzer();

    BufferedReader in = null;
    if (queries != null) {
        in = Files.newBufferedReader(Paths.get(queries), StandardCharsets.UTF_8);
    } else {
        in = new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8));
    }

    QueryParser parser = new QueryParser(field, analyzer);
    while (true) {
        if (queries == null && queryString == null) {
            // prompt the user
            System.out.println("Enter query: ");
        }

        String line = queryString != null ? queryString : in.readLine();
        if (line == null) {
            break;
        }
        line = line.trim();
        if (line.length() == 0) {
            break;
        }

        Query query = parser.parse(line);
        System.out.println("Searching for: " + query.toString(field));

        // "vsm" selects ClassicSimilarity (TF-IDF); "bm25" selects BM25Similarity.
        switch (score) {
        case DEFAULT:
            break;
        case VSM:
            searcher.setSimilarity(new ClassicSimilarity());
            break;
        case BM25:
            searcher.setSimilarity(new BM25Similarity());
            break;
        }

        if (repeat > 0) { // repeat & time as benchmark
            Date start = new Date();
            for (int i = 0; i < repeat; i++) {
                searcher.search(query, 100);
            }
            Date end = new Date();
            System.out.println("Time: " + (end.getTime() - start.getTime()) + "ms");
        }

        doPagingSearch(in, searcher, query, hitsPerPage, raw, queries == null && queryString == null);

        if (queryString != null) {
            break;
        }
    }
    reader.close();
}
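A hypothetical programmatic invocation of this demo (the index directory and query are illustrative); -sim vsm routes to ClassicSimilarity, i.e. the classic vector space model:

// Equivalent to the command line: SearchFiles -index ./index -query "lucene similarity" -sim vsm
SearchFiles.main(new String[] { "-index", "./index", "-query", "lucene similarity", "-sim", "vsm" });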
From source file:com.o19s.es.explore.ExplorerQuery.java
License:Apache License
@Override
public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
    if (!needsScores) {
        return searcher.createWeight(query, false, boost);
    }
    final Weight subWeight = searcher.createWeight(query, true, boost);
    Set<Term> terms = new HashSet<>();
    subWeight.extractTerms(terms);

    if (isCollectionScoped()) {
        // ClassicSimilarity is used here only as an IDF calculator over collection statistics.
        ClassicSimilarity sim = new ClassicSimilarity();
        StatisticsHelper df_stats = new StatisticsHelper();
        StatisticsHelper idf_stats = new StatisticsHelper();
        StatisticsHelper ttf_stats = new StatisticsHelper();

        for (Term term : terms) {
            TermContext ctx = TermContext.build(searcher.getTopReaderContext(), term);
            TermStatistics tStats = searcher.termStatistics(term, ctx);
            df_stats.add(tStats.docFreq());
            idf_stats.add(sim.idf(tStats.docFreq(), searcher.getIndexReader().numDocs()));
            ttf_stats.add(tStats.totalTermFreq());
        }

        /*
         * If no terms are parsed in the query we opt for returning 0
         * instead of throwing an exception that could break various pipelines.
         */
        float constantScore;
        if (terms.size() > 0) {
            switch (type) {
            case "sum_classic_idf":    constantScore = idf_stats.getSum();    break;
            case "mean_classic_idf":   constantScore = idf_stats.getMean();   break;
            case "max_classic_idf":    constantScore = idf_stats.getMax();    break;
            case "min_classic_idf":    constantScore = idf_stats.getMin();    break;
            case "stddev_classic_idf": constantScore = idf_stats.getStdDev(); break;
            case "sum_raw_df":         constantScore = df_stats.getSum();     break;
            case "min_raw_df":         constantScore = df_stats.getMin();     break;
            case "max_raw_df":         constantScore = df_stats.getMax();     break;
            case "mean_raw_df":        constantScore = df_stats.getMean();    break;
            case "stddev_raw_df":      constantScore = df_stats.getStdDev();  break;
            case "sum_raw_ttf":        constantScore = ttf_stats.getSum();    break;
            case "min_raw_ttf":        constantScore = ttf_stats.getMin();    break;
            case "max_raw_ttf":        constantScore = ttf_stats.getMax();    break;
            case "mean_raw_ttf":       constantScore = ttf_stats.getMean();   break;
            case "stddev_raw_ttf":     constantScore = ttf_stats.getStdDev(); break;
            case "unique_terms_count": constantScore = terms.size();          break;
            default:
                throw new RuntimeException("Invalid stat type specified.");
            }
        } else {
            constantScore = 0.0f;
        }

        return new ConstantScoreWeight(ExplorerQuery.this, constantScore) {
            @Override
            public Explanation explain(LeafReaderContext context, int doc) throws IOException {
                Scorer scorer = scorer(context);
                int newDoc = scorer.iterator().advance(doc);
                assert newDoc == doc; // this is a DocIdSetIterator.all
                return Explanation.match(scorer.score(), "Stat Score: " + type);
            }

            @Override
            public Scorer scorer(LeafReaderContext context) throws IOException {
                return new ConstantScoreScorer(this, constantScore,
                        DocIdSetIterator.all(context.reader().maxDoc()));
            }

            @Override
            public boolean isCacheable(LeafReaderContext ctx) {
                return true;
            }
        };
    } else if (type.endsWith("_raw_tf")) {
        // Rewrite this into a boolean query where we can inject our PostingsExplorerQuery
        BooleanQuery.Builder qb = new BooleanQuery.Builder();
        for (Term t : terms) {
            qb.add(new BooleanClause(new PostingsExplorerQuery(t, PostingsExplorerQuery.Type.TF),
                    BooleanClause.Occur.SHOULD));
        }
        // FIXME: completely refactor this class and stop accepting an arbitrary query rather than a list of
        // terms directly. Rewriting at this point is wrong; additionally we almost certainly build the
        // TermContext twice for every term. The root problem is the reliance on extractTerms, which happens
        // too late in the process.
        Query q = qb.build().rewrite(searcher.getIndexReader());
        return new ExplorerQuery.ExplorerWeight(this, searcher.createWeight(q, true, boost), type);
    }
    throw new IllegalArgumentException("Unknown ExplorerQuery type [" + type + "]");
}
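Isolated from the loop above, the per-term value feeding the *_classic_idf aggregates is just ClassicSimilarity's IDF applied to collection-level counts; a minimal sketch with made-up statistics (the real values come from the searcher, as shown in the snippet):

ClassicSimilarity sim = new ClassicSimilarity();
long docFreq = 42;     // would come from TermStatistics.docFreq()
int numDocs = 100000;  // would come from IndexReader.numDocs()
float idf = sim.idf(docFreq, numDocs); // aggregated by StatisticsHelper into sum/mean/max/min/stddev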