List of usage examples for org.apache.lucene.search BooleanQuery setMaxClauseCount
public static void setMaxClauseCount(int maxClauseCount)
From source file:edu.cmu.lti.oaqa.knn4qa.cand_providers.LuceneCandidateProvider.java
License:Apache License
@Override public CandidateInfo getCandidates(int queryNum, Map<String, String> queryData, int maxQty) throws Exception { ArrayList<CandidateEntry> resArr = new ArrayList<CandidateEntry>(); String queryID = queryData.get(ID_FIELD_NAME); if (null == queryID) { throw new Exception( String.format("Query id (%s) is undefined for query # %d", ID_FIELD_NAME, queryNum)); }// w w w. ja v a2 s.c om String text = queryData.get(TEXT_FIELD_NAME); if (null == text) { throw new Exception(String.format("Query (%s) is undefined for query # %d", TEXT_FIELD_NAME, queryNum)); } String query = text.trim(); ArrayList<String> toks = new ArrayList<String>(); for (String s : mSpaceSplit.split(query)) { toks.add(s); } if (2 * toks.size() > BooleanQuery.getMaxClauseCount()) { // This a heuristic, but it should work fine in many cases BooleanQuery.setMaxClauseCount(2 * toks.size()); } int numFound = 0; if (!query.isEmpty()) { // QueryParser cannot be shared among threads! QueryParser parser = new QueryParser(TEXT_FIELD_NAME, mAnalyzer); parser.setDefaultOperator(QueryParser.OR_OPERATOR); Query queryParsed = parser.parse(query); TopDocs hits = mSearcher.search(queryParsed, maxQty); numFound = hits.totalHits; ScoreDoc[] scoreDocs = hits.scoreDocs; for (ScoreDoc oneHit : scoreDocs) { Document doc = mSearcher.doc(oneHit.doc); String id = doc.get(ID_FIELD_NAME); float score = oneHit.score; resArr.add(new CandidateEntry(id, score)); } } CandidateEntry[] results = resArr.toArray(new CandidateEntry[resArr.size()]); Arrays.sort(results); return new CandidateInfo(numFound, results); }
From source file:edu.cmu.lti.oaqa.knn4qa.cand_providers.TranRecSortByProb.java
License:Apache License
/**
 * Retrieves up to {@code maxQty} candidates for one query, expanding each query
 * word with its top translations from a GIZA translation table before searching.
 *
 * <p>For every non-empty token the method appends the token itself plus up to
 * {@code mTopTranQty} translated words (optionally weighted with a "^prob" boost
 * when {@code mUseWeights} is set), then runs the expanded OR-query through
 * Lucene and returns the sorted, scored candidates.
 *
 * @param queryNum  ordinal of the query (used only in error messages)
 * @param queryData map that must contain ID_FIELD_NAME and TEXT_FIELD_NAME entries
 * @param maxQty    maximum number of candidates to retrieve
 * @throws Exception if the id or text field is missing, or parsing/search fails
 */
@Override
public CandidateInfo getCandidates(int queryNum, Map<String, String> queryData, int maxQty) throws Exception {
    ArrayList<CandidateEntry> resArr = new ArrayList<CandidateEntry>();
    String queryID = queryData.get(ID_FIELD_NAME);
    if (null == queryID) {
        throw new Exception(
                String.format("Query id (%s) is undefined for query # %d", ID_FIELD_NAME, queryNum));
    }
    String text = queryData.get(TEXT_FIELD_NAME);
    if (null == text) {
        throw new Exception(String.format("Query (%s) is undefined for query # %d", TEXT_FIELD_NAME, queryNum));
    }
    String origQuery = text.trim();
    int numFound = 0;
    if (!origQuery.isEmpty()) {
        StringBuffer queryToks = new StringBuffer();
        // tokQty counts every term added to the expanded query (originals + translations),
        // so it can be used below to raise Lucene's global max-clause limit.
        int tokQty = 0;
        for (String w : origQuery.split("\\s+"))
            if (!w.isEmpty()) {
                tokQty++;
                queryToks.append(w + " ");
                final WordEntry we = mFieldIndex.getWordEntry(w);
                if (we != null) {
                    // Translation probabilities from answers to questions for this word id.
                    GizaOneWordTranRecs rec0 = mAnswToQuestTran.getTranProbs(we.mWordId);
                    if (rec0 != null) {
                        // Sort (id, prob) pairs so the highest-probability translations come first.
                        TranRecSortByProb rec[] = new TranRecSortByProb[rec0.mDstIds.length];
                        for (int i = 0; i < rec0.mDstIds.length; ++i)
                            rec[i] = new TranRecSortByProb(rec0.mDstIds[i], rec0.mProbs[i]);
                        Arrays.sort(rec);
                        int qty = mTopTranQty;
                        for (int i = 0; i < Math.min(rec.length, qty); ++i) {
                            int dstId = rec[i].mDstWorId;
                            if (dstId != we.mWordId) {
                                // Append the translated word, optionally boosted by its probability.
                                queryToks.append(mFieldIndex.getWord(dstId)
                                        + (mUseWeights ? "^" + String.format("%.3f", rec[i].mProb) : "") + " ");
                                ++tokQty;
                            } else {
                                // If we skip a word, b/c it's the same as the query word
                                // we will get one more candidate so that exactly
                                // Math.min(mTopTranQty, rec.length) words were added
                                ++qty;
                            }
                        }
                    }
                }
            }
        synchronized (lock) { // this is a static lock, it will block all instance of this class
            if (tokQty > BooleanQuery.getMaxClauseCount()) {
                BooleanQuery.setMaxClauseCount(tokQty);
            }
        }
        String luceneQuery = queryToks.toString().trim();
        if (!luceneQuery.isEmpty()) {
            // QueryParser cannot be shared among threads!
            QueryParser parser = new QueryParser(TEXT_FIELD_NAME, mAnalyzer);
            parser.setDefaultOperator(QueryParser.OR_OPERATOR);
            Query queryParsed = parser.parse(luceneQuery);
            TopDocs hits = mSearcher.search(queryParsed, maxQty);
            numFound = hits.totalHits;
            ScoreDoc[] scoreDocs = hits.scoreDocs;
            for (ScoreDoc oneHit : scoreDocs) {
                Document doc = mSearcher.doc(oneHit.doc);
                String id = doc.get(ID_FIELD_NAME);
                float score = oneHit.score;
                resArr.add(new CandidateEntry(id, score));
            }
        }
    }
    CandidateEntry[] results = resArr.toArray(new CandidateEntry[resArr.size()]);
    Arrays.sort(results);
    return new CandidateInfo(numFound, results);
}
From source file:edu.harvard.iq.dvn.core.index.Indexer.java
License:Apache License
public List searchVariables(List<Long> studyIds, SearchTerm searchTerm) throws IOException { BooleanQuery indexQuery = null;/* w w w . j ava 2 s. c o m*/ BooleanQuery searchQuery = new BooleanQuery(); BooleanQuery.setMaxClauseCount(dvnMaxClauseCount); if (studyIds != null) { searchQuery.add(orIdSearchTermClause(studyIds, "varStudyId"), BooleanClause.Occur.MUST); } if (searchTerm.getFieldName().equalsIgnoreCase("variable")) { indexQuery = buildVariableQuery(searchTerm); if (searchTerm.getOperator().equals("=")) { searchQuery.add(indexQuery, BooleanClause.Occur.MUST); } else { searchQuery.add(indexQuery, BooleanClause.Occur.MUST_NOT); } } List<Document> variableResults = getHits(searchQuery); List<Long> variableIdResults = getVariableHitIds(variableResults); // TODO: // Double-check if the intersectionVarDocResults() below - i.e., filtering // the hit list against the list of supplied study ids - is necessary at all. // I would think not - because the study IDs were already added to the // search query, above. -- L.A. List<Long> finalResults = studyIds != null ? intersectionVarDocResults(variableResults, studyIds) : variableIdResults; return finalResults; }
From source file:edu.harvard.iq.dvn.core.index.Indexer.java
License:Apache License
public List searchVariables(List<Long> studyIds, List<SearchTerm> searchTerms, boolean varIdReturnValues) throws IOException { BooleanQuery searchQuery = new BooleanQuery(); BooleanQuery.setMaxClauseCount(dvnMaxClauseCount); if (studyIds != null) { searchQuery.add(orIdSearchTermClause(studyIds, "varStudyId"), BooleanClause.Occur.MUST); }/* w ww. j ava 2 s.c o m*/ for (Iterator it = searchTerms.iterator(); it.hasNext();) { SearchTerm elem = (SearchTerm) it.next(); BooleanQuery indexQuery = null; if (elem.getFieldName().equalsIgnoreCase("variable")) { indexQuery = buildVariableQuery(elem); if (elem.getOperator().equals("=")) { searchQuery.add(indexQuery, BooleanClause.Occur.MUST); } else { searchQuery.add(indexQuery, BooleanClause.Occur.MUST_NOT); } } } List<Long> finalResults = null; // TODO: // Double-check if the intersection(Var)DocResults() below - i.e., filtering // the hit list against the list of supplied study ids - is necessary at all. // I would think not - because the study IDs were already added to the // search query, above. -- L.A. if (varIdReturnValues) { List<Document> variableResults = getHits(searchQuery); List<Long> variableIdResults = getVariableHitIds(variableResults); finalResults = studyIds != null ? intersectionVarDocResults(variableResults, studyIds) : variableIdResults; } else { List<Long> studyIdResults = getVariableHitStudyIds(searchQuery); // gets the study ids finalResults = studyIds != null ? intersectionResults(studyIdResults, studyIds) : studyIdResults; } return finalResults; }
From source file:edu.harvard.iq.dvn.core.index.Indexer.java
License:Apache License
public List searchFileMetadata(List<Long> studyIds, List<SearchTerm> searchTerms, boolean fileIdReturnValues) throws IOException { BooleanQuery searchQuery = new BooleanQuery(); BooleanQuery.setMaxClauseCount(dvnMaxClauseCount); if (studyIds != null) { searchQuery.add(orIdSearchTermClause(studyIds, "id"), BooleanClause.Occur.MUST); }//from w w w. j av a 2 s.co m for (Iterator it = searchTerms.iterator(); it.hasNext();) { SearchTerm elem = (SearchTerm) it.next(); BooleanQuery indexQuery = null; // Determine if this is a file-level metadata search term: if (isFileMetadataField(elem.getFieldName())) { indexQuery = buildFileMetadataQuery(elem); logger.fine("INDEXER: filemetadata element query (native): " + indexQuery.toString()); if (elem.getOperator().equals("=")) { // We only support "=" on file metadata, for now, anyway. // -- L.A. searchQuery.add(indexQuery, BooleanClause.Occur.MUST); } else { searchQuery.add(indexQuery, BooleanClause.Occur.MUST_NOT); } } } logger.fine("INDEXER: filemetadata combined query (native): " + searchQuery.toString()); List<Long> finalResults = null; // TODO: // Double-check if the intersection(File)DocResults() below - i.e., filtering // the hit list against the list of supplied study ids - is necessary at all. // I would think not - because the study IDs were already added to the // search query, above. -- L.A. if (fileIdReturnValues) { List<Document> fileMetadataResults = getHits(searchQuery); List<Long> fileMetadataIdResults = getFileMetadataHitIds(fileMetadataResults); finalResults = studyIds != null ? intersectionFileDocResults(fileMetadataResults, studyIds) : fileMetadataIdResults; } else { List<Long> studyIdResults = getFileMetadataHitStudyIds(searchQuery); // gets the study ids finalResults = studyIds != null ? intersectionResults(studyIdResults, studyIds) : studyIdResults; } return finalResults; }
From source file:edu.harvard.iq.dvn.core.index.Indexer.java
License:Apache License
BooleanQuery orPhraseQuery(List<SearchTerm> orSearchTerms) { BooleanQuery orTerms = new BooleanQuery(); orTerms.setMaxClauseCount(dvnMaxClauseCount); for (Iterator it = orSearchTerms.iterator(); it.hasNext();) { SearchTerm elem = (SearchTerm) it.next(); String[] phrase = getPhrase(elem.getValue().toLowerCase().trim()); if (phrase.length > 1) { PhraseQuery phraseQuery = new PhraseQuery(); phraseQuery.setSlop(10);// w w w . j a v a2 s . c o m for (int i = 0; i < phrase.length; i++) { phraseQuery.add(new Term(elem.getFieldName(), phrase[i].toLowerCase().trim())); } orTerms.add(phraseQuery, BooleanClause.Occur.SHOULD); } else if (phrase.length == 1) { // Term t = new Term(elem.getFieldName(), elem.getValue().toLowerCase().trim()); logger.fine("INDEXER: orPhraseQuery: search element value: " + phrase[0].toLowerCase().trim()); Term t = new Term(elem.getFieldName(), phrase[0].toLowerCase().trim()); logger.fine("INDEXER: orPhraseQuery: term value=" + t.text()); TermQuery orQuery = new TermQuery(t); logger.fine("INDEXER: TermQuery orQuery (native): " + orQuery.toString()); orTerms.add(orQuery, BooleanClause.Occur.SHOULD); } } return orTerms; }
From source file:edu.harvard.iq.dvn.core.index.Indexer.java
License:Apache License
BooleanQuery orPhraseOrWildcardQuery(List<SearchTerm> orSearchTerms) { BooleanQuery orTerms = new BooleanQuery(); orTerms.setMaxClauseCount(dvnMaxClauseCount); for (Iterator it = orSearchTerms.iterator(); it.hasNext();) { SearchTerm elem = (SearchTerm) it.next(); String[] phrase = getPhrase(elem.getValue().toLowerCase().trim()); if (phrase.length > 1) { PhraseQuery phraseQuery = new PhraseQuery(); phraseQuery.setSlop(10);/*from ww w . j a v a 2 s . c om*/ for (int i = 0; i < phrase.length; i++) { phraseQuery.add(new Term(elem.getFieldName(), phrase[i].toLowerCase().trim())); } orTerms.add(phraseQuery, BooleanClause.Occur.SHOULD); } else if (phrase.length == 1) { // Term t = new Term(elem.getFieldName(), elem.getValue().toLowerCase().trim()); logger.fine("INDEXER: wildcardQuery: search element value: " + phrase[0].toLowerCase().trim()); if (isPrefixSearchableFileMetadataField(elem.getFieldName())) { Term t = new Term(elem.getFieldName(), phrase[0].toLowerCase().trim() + "*"); logger.fine("INDEXER: wildcardQuery: term value=" + t.text()); WildcardQuery wcQuery = new WildcardQuery(t); logger.fine("INDEXER: Term wildcardQuery (native): " + wcQuery.toString()); orTerms.add(wcQuery, BooleanClause.Occur.SHOULD); } else { logger.fine("INDEXER: building PhraseQuery: search element value: " + phrase[0].toLowerCase().trim()); Term t = new Term(elem.getFieldName(), phrase[0].toLowerCase().trim()); logger.fine("INDEXER: building PhraseQuery: term value=" + t.text()); TermQuery orQuery = new TermQuery(t); logger.fine("INDEXER: TermQuery orQuery (native): " + orQuery.toString()); orTerms.add(orQuery, BooleanClause.Occur.SHOULD); } } } return orTerms; }
From source file:edu.harvard.iq.dvn.core.index.Indexer.java
License:Apache License
public BooleanQuery andSearchTermClause(List<SearchTerm> andSearchTerms) { BooleanQuery andTerms = new BooleanQuery(); andTerms.setMaxClauseCount(dvnMaxClauseCount); Query rQuery = null;/*www . j ava 2 s. c om*/ for (Iterator it = andSearchTerms.iterator(); it.hasNext();) { SearchTerm elem = (SearchTerm) it.next(); if (elem.getOperator().equals("<")) { Term end = new Term(elem.getFieldName(), elem.getValue().toLowerCase().trim()); Term begin = null; rQuery = new TermRangeQuery(elem.getFieldName(), null, elem.getValue().toLowerCase().trim(), false, false); // rQuery = new RangeQuery(begin,end,true); andTerms.add(rQuery, BooleanClause.Occur.MUST); } else if (elem.getOperator().equals(">")) { Term end = null; Term begin = new Term(elem.getFieldName(), elem.getValue().toLowerCase().trim()); rQuery = new TermRangeQuery(elem.getFieldName(), elem.getValue().toLowerCase().trim(), null, false, false); // rQuery = new RangeQuery(begin,end,true); andTerms.add(rQuery, BooleanClause.Occur.MUST); } else if (elem.getFieldName().equalsIgnoreCase("any")) { andTerms = buildAnyQuery(elem.getValue().toLowerCase().trim()); } else { String[] phrase = getPhrase(elem.getValue().toLowerCase().trim()); if (phrase.length > 1) { PhraseQuery phraseQuery = new PhraseQuery(); phraseQuery.setSlop(10); andTerms.add(partialMatch(elem, 10)); } else if (phrase.length == 1) { // Term t = new Term(elem.getFieldName(), elem.getValue().toLowerCase().trim()); Term t = new Term(elem.getFieldName(), phrase[0].toLowerCase().trim()); TermQuery andQuery = new TermQuery(t); if (elem.getOperator().equals("=")) { andTerms.add(andQuery, BooleanClause.Occur.MUST); } else if (elem.getOperator().equalsIgnoreCase("-")) { andTerms.add(andQuery, BooleanClause.Occur.MUST_NOT); } } } } return andTerms; }
From source file:edu.harvard.iq.dvn.core.index.Indexer.java
License:Apache License
/**
 * Combines the given sub-queries into a single conjunction: every sub-query
 * MUST match.
 *
 * @param andQueries queries to AND together
 * @return the combined conjunction query
 */
BooleanQuery andQueryClause(List<BooleanQuery> andQueries) {
    BooleanQuery conjunction = new BooleanQuery();
    BooleanQuery.setMaxClauseCount(dvnMaxClauseCount);
    for (Iterator it = andQueries.iterator(); it.hasNext();) {
        BooleanQuery subQuery = (BooleanQuery) it.next();
        conjunction.add(new BooleanClause(subQuery, BooleanClause.Occur.MUST));
    }
    return conjunction;
}
From source file:edu.isi.pfindr.learn.search.LuceneSearchEngine.java
License:Apache License
public static Map<String, Double> search(String queryString, String descriptionExpandedNotStemmed, String descriptionExpandedStemmed) { Map<String, Double> searchResultMap = new HashMap<String, Double>(); //Escape special characters in Lucene String originalDefinitionEscaped = LUCENE_PATTERN.matcher(queryString).replaceAll(REPLACEMENT_STRING_ESCAPE) .toLowerCase();//from w w w .j av a 2 s. c o m descriptionExpandedNotStemmed = LUCENE_PATTERN.matcher(descriptionExpandedNotStemmed) .replaceAll(REPLACEMENT_STRING_ESCAPE).toLowerCase(); descriptionExpandedStemmed = LUCENE_PATTERN.matcher(descriptionExpandedStemmed) .replaceAll(REPLACEMENT_STRING_ESCAPE).toLowerCase(); try { String originalDefinitionStemmedQuery = CleanDataUtil .preprocessStemAndTokenize(queryString.toLowerCase()).trim(); StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_30); Query query; //Get the top hits TopScoreDocCollector collector = TopScoreDocCollector.create(100000, true); indexSearcher = getIndexSearcher(); BooleanQuery.setMaxClauseCount(Integer.MAX_VALUE); //+(+contents:hello +contents:world) +priority:high //"jakarta apache"^4 "Apache Lucene" //////////// if (!originalDefinitionStemmedQuery.equals("")) { originalDefinitionStemmedQuery = LUCENE_PATTERN.matcher(originalDefinitionStemmedQuery) .replaceAll(REPLACEMENT_STRING_ESCAPE); String[] fields = new String[] { "content", "contentStemmed", "contentExpanded", "contentExpandedStemmed" }; String[] queries = new String[] { "\"" + originalDefinitionEscaped.trim().toLowerCase() + "\"^8 " + originalDefinitionEscaped.trim().toLowerCase(), originalDefinitionStemmedQuery + "^3", descriptionExpandedNotStemmed, descriptionExpandedStemmed }; query = MultiFieldQueryParser.parse(Version.LUCENE_30, queries, fields, analyzer); } else { QueryParser queryParser = new QueryParser(Version.LUCENE_30, "content", analyzer); query = queryParser.parse("\"" + originalDefinitionEscaped.trim().toLowerCase() + "\"^8 " + 
originalDefinitionEscaped.trim().toLowerCase()); } //////// indexSearcher.search(query, collector); ScoreDoc[] scoreDocs = collector.topDocs().scoreDocs; int hitCount = collector.getTotalHits(); if (hitCount > 0) { //System.out.println("Hits for \"" + queryString + "\" were found by:"); // Iterate over the Documents in the Hits object ScoreDoc scoreDoc; for (int i = 0; i < hitCount; i++) { scoreDoc = scoreDocs[i]; //System.out.println("docId: " + scoreDoc.doc + "\t" + "docScore: " + scoreDoc.score); Document doc = indexSearcher.doc(scoreDoc.doc); //System.out.println(" " + (i + 1) + ". " + doc.get("id")); //System.out.println("Content: " + doc.get("orgContent")); if (!searchResultMap.containsKey((String) doc.get("orgContent"))) searchResultMap.put(((String) doc.get("orgContent")), new Double(scoreDoc.score)); } } analyzer = null; } catch (org.apache.lucene.queryParser.ParseException pe) { // TODO Auto-generated catch block pe.printStackTrace(); } catch (IOException ioe) { // TODO Auto-generated catch block ioe.printStackTrace(); } finally { /*try{ closeIndexSearcher(); }catch(IOException ioe){ ioe.printStackTrace(); }*/ } return searchResultMap; }