Java tutorial
/* * Copyright 2015 Carnegie Mellon University * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package edu.cmu.lti.oaqa.knn4qa.cand_providers; import java.util.*; import java.io.*; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.WhitespaceAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.*; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.*; import org.apache.lucene.search.similarities.BM25Similarity; import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.store.FSDirectory; import edu.cmu.lti.oaqa.knn4qa.letor.FeatureExtractor; import com.google.common.base.Splitter; public class LuceneCandidateProvider extends CandidateProvider { @Override public String getName() { return this.getClass().getName(); } public LuceneCandidateProvider(String indexDirName) throws Exception { File indexDir = new File(indexDirName); if (!indexDir.exists()) { throw new Exception(String.format("Directory '%s' doesn't exist", indexDirName)); } mReader = DirectoryReader.open(FSDirectory.open(indexDir)); mSearcher = new IndexSearcher(mReader); mSearcher.setSimilarity(mSimilarity); } /* * The function getCandidates is thread-safe, because IndexSearcher is thread safe: * https://wiki.apache.org/lucene-java/LuceneFAQ#Is_the_IndexSearcher_thread-safe.3F */ @Override public boolean isThreadSafe() { return true; } @Override public CandidateInfo getCandidates(int queryNum, Map<String, String> queryData, int maxQty) throws Exception { ArrayList<CandidateEntry> resArr = new ArrayList<CandidateEntry>(); String queryID = queryData.get(ID_FIELD_NAME); if (null == queryID) { throw new Exception( String.format("Query id (%s) is undefined for query # %d", ID_FIELD_NAME, queryNum)); } String text = queryData.get(TEXT_FIELD_NAME); if (null == text) { throw new Exception(String.format("Query (%s) is undefined for query # %d", TEXT_FIELD_NAME, queryNum)); } String query = text.trim(); ArrayList<String> toks = new ArrayList<String>(); for (String s : mSpaceSplit.split(query)) { toks.add(s); } if (2 * toks.size() > BooleanQuery.getMaxClauseCount()) { // This a heuristic, but it should work fine in many cases BooleanQuery.setMaxClauseCount(2 * toks.size()); } int numFound = 0; if (!query.isEmpty()) { // QueryParser cannot be shared among threads! QueryParser parser = new QueryParser(TEXT_FIELD_NAME, mAnalyzer); parser.setDefaultOperator(QueryParser.OR_OPERATOR); Query queryParsed = parser.parse(query); TopDocs hits = mSearcher.search(queryParsed, maxQty); numFound = hits.totalHits; ScoreDoc[] scoreDocs = hits.scoreDocs; for (ScoreDoc oneHit : scoreDocs) { Document doc = mSearcher.doc(oneHit.doc); String id = doc.get(ID_FIELD_NAME); float score = oneHit.score; resArr.add(new CandidateEntry(id, score)); } } CandidateEntry[] results = resArr.toArray(new CandidateEntry[resArr.size()]); Arrays.sort(results); return new CandidateInfo(numFound, results); } private IndexReader mReader = null; private IndexSearcher mSearcher = null; private Similarity mSimilarity = new BM25Similarity(FeatureExtractor.BM25_K1, FeatureExtractor.BM25_B); private Analyzer mAnalyzer = new WhitespaceAnalyzer(); private static Splitter mSpaceSplit = Splitter.on(' ').omitEmptyStrings().trimResults(); }