List of usage examples for org.apache.lucene.search MultiPhraseQuery MultiPhraseQuery
MultiPhraseQuery
From source file:aos.lucene.search.advanced.MultiPhraseQueryTest.java
License:Apache License
public void testBasic() throws Exception { MultiPhraseQuery query = new MultiPhraseQuery(); query.add(new Term[] { // #A new Term("field", "quick"), // #A new Term("field", "fast") // #A });/*from ww w.ja v a 2 s . c om*/ query.add(new Term("field", "fox")); // #B LOGGER.info(query); TopDocs hits = searcher.search(query, 10); assertEquals("fast fox match", 1, hits.totalHits); query.setSlop(1); hits = searcher.search(query, 10); assertEquals("both match", 2, hits.totalHits); }
From source file:com.bewsia.script.safe.lucene.SEntity.java
License:Open Source License
public MultiPhraseQuery newMultiPhraseQuery() { return new MultiPhraseQuery(); }
From source file:com.greplin.lucene.query.PhrasePrefixQuery.java
License:Apache License
@Override public final Query rewrite(final IndexReader reader) throws IOException { Term[] prefixTerms = getPrefixTerms(terms.get(terms.size() - 1), reader); if (prefixTerms == null) { return new MatchNoDocsQuery(); }//from w ww . j a va 2s . c o m MultiPhraseQuery query = new MultiPhraseQuery(); for (int i = 0; i < terms.size() - 1; i++) { query.add(new Term(field, terms.get(i))); } query.add(prefixTerms); return query; }
From source file:com.leavesfly.lia.advsearching.MultiPhraseQueryTest.java
License:Apache License
public void testBasic() throws Exception { MultiPhraseQuery query = new MultiPhraseQuery(); query.add(new Term[] { // #A new Term("field", "quick"), // #A new Term("field", "fast") // #A });/* w w w. jav a 2 s.co m*/ query.add(new Term("field", "fox")); // #B System.out.println(query); TopDocs hits = searcher.search(query, 10); assertEquals("fast fox match", 1, hits.totalHits); query.setSlop(1); hits = searcher.search(query, 10); assertEquals("both match", 2, hits.totalHits); }
From source file:com.nearinfinity.blur.lucene.search.SuperParser.java
License:Apache License
@Override protected MultiPhraseQuery newMultiPhraseQuery() { return new MultiPhraseQuery() { private static final long serialVersionUID = 2743009696906520410L; @Override//from w w w .j a va 2 s . c o m public void add(Term[] terms, int position) { super.add(terms, position); for (Term term : terms) { addField(this, term.field()); } } }; }
From source file:com.zimbra.cs.index.AbstractIndexStoreTest.java
License:Open Source License
@Test public void multiPhraseQuery() throws Exception { ZimbraLog.test.debug("--->TEST multiPhraseQuery"); Mailbox mbox = MailboxManager.getInstance().getMailboxByAccountId(MockProvisioning.DEFAULT_ACCOUNT_ID); createContact(mbox, "Non", "Match", "nOn.MaTchiNg@zimbra.com"); Contact contact1 = createContact(mbox, "Paul", "AA", "aa@example.net", "Software Development Engineer"); createContact(mbox, "Jane", "BB", "bb@example.net", "Software Planning Engineer"); Contact contact2 = createContact(mbox, "Peter", "CC", "cc@example.net", "Software Dev Engineer"); createContact(mbox, "Avril", "DD", "dd@example.net", "Software Architectural Engineer"); Contact contact3 = createContact(mbox, "Leo", "EE", "ee@example.net", "Software Developer Engineer"); Contact contact4 = createContact(mbox, "Wow", "DD", "dd@example.net", "Softly Development Engineer"); createContact(mbox, "Given", "Surname", "GiV.SurN@zimbra.com"); mbox.index.indexDeferredItems(); // Make sure all indexing has been done IndexStore index = mbox.index.getIndexStore(); ZimbraIndexSearcher searcher = index.openSearcher(); MultiPhraseQuery pquery = new MultiPhraseQuery(); // Lower case required for each term for Lucene Term[] firstWords = { new Term(LuceneFields.L_CONTENT, "softly"), new Term(LuceneFields.L_CONTENT, "software") }; pquery.add(firstWords);/*from w w w. j a v a 2 s.c o m*/ Term[] secondWords = { new Term(LuceneFields.L_CONTENT, "dev"), new Term(LuceneFields.L_CONTENT, "development"), new Term(LuceneFields.L_CONTENT, "developer") }; pquery.add(secondWords); pquery.add(new Term(LuceneFields.L_CONTENT, "engineer")); ZimbraTopDocs result = searcher.search(pquery, 100); Assert.assertNotNull("searcher.search result object", result); ZimbraLog.test.debug("Result for search [hits=%d]:%s", result.getTotalHits(), result.toString()); Assert.assertEquals("Number of hits", 4, result.getTotalHits()); List<String> expecteds = Lists.newArrayList(); List<String> matches = Lists.newArrayList(); matches.add(getBlobIdForResultDoc(searcher, result, 0)); matches.add(getBlobIdForResultDoc(searcher, result, 1)); matches.add(getBlobIdForResultDoc(searcher, result, 2)); matches.add(getBlobIdForResultDoc(searcher, result, 3)); expecteds.add(String.valueOf(contact1.getId())); expecteds.add(String.valueOf(contact2.getId())); expecteds.add(String.valueOf(contact3.getId())); expecteds.add(String.valueOf(contact4.getId())); Collections.sort(matches); Collections.sort(expecteds); for (int ndx = 0; ndx < 4; ndx++) { Assert.assertEquals("Match Blob ID", expecteds.get(0), matches.get(0)); } }
From source file:com.zimbra.cs.index.LuceneQueryOperation.java
License:Open Source License
private Query expandLazyMultiPhraseQuery(Query query) throws IOException { if (query instanceof LazyMultiPhraseQuery) { LazyMultiPhraseQuery lazy = (LazyMultiPhraseQuery) query; int max = LC.zimbra_index_wildcard_max_terms_expanded.intValue(); MultiPhraseQuery mquery = new MultiPhraseQuery(); for (Term[] terms : lazy.getTermArrays()) { if (terms.length != 1) { mquery.add(terms);//from w ww . java 2s . c om continue; } Term base = terms[0]; if (!lazy.expand.contains(base)) { mquery.add(terms); continue; } List<Term> expanded = Lists.newArrayList(); TermFieldEnumeration itr = searcher.getIndexReader().getTermsForField(base.field(), base.text()); try { while (itr.hasMoreElements()) { BrowseTerm term = itr.nextElement(); if (term != null && term.getText().startsWith(base.text())) { if (expanded.size() >= max) { // too many terms expanded break; } expanded.add(new Term(base.field(), term.getText())); } else { break; } } } finally { Closeables.closeQuietly(itr); } if (expanded.isEmpty()) { return null; } else { mquery.add(expanded.toArray(new Term[expanded.size()])); } } return mquery; } else if (query instanceof BooleanQuery) { ListIterator<BooleanClause> itr = ((BooleanQuery) query).clauses().listIterator(); while (itr.hasNext()) { BooleanClause clause = itr.next(); Query result = expandLazyMultiPhraseQuery(clause.getQuery()); if (result == null) { if (clause.isRequired()) { return null; } else { itr.remove(); } } else if (result != clause.getQuery()) { clause.setQuery(result); } } return ((BooleanQuery) query).clauses().isEmpty() ? null : query; } else { return query; } }
From source file:de.powerstaff.business.service.impl.GoogleStyleQueryParser.java
License:Open Source License
protected void addPhraseQuery(String aTerm, BooleanQuery aQuery, String aField, Analyzer aAnalyzer) throws IOException { MultiPhraseQuery thePhraseQuery = new MultiPhraseQuery(); TokenStream theTokenStream = aAnalyzer.tokenStream(aField, new StringReader(aTerm)); while (theTokenStream.incrementToken()) { TermAttribute theTermAttribute = theTokenStream.getAttribute(TermAttribute.class); String theTokenText = theTermAttribute.term(); Term theTerm = new Term(aField, theTokenText); if (!isWildcardTerm(theTokenText)) { thePhraseQuery.add(theTerm); } else {/*from www .j av a2 s . c om*/ Term theWildcardTerm = new Term(theTerm.field(), getCorrectedWildcardTerm(theTerm.text())); WildcardTermEnum theEnum = new WildcardTermEnum(reader, theWildcardTerm); try { List<Term> theTerms = new ArrayList<Term>(); do { theTerms.add(theEnum.term()); } while (theEnum.next()); thePhraseQuery.add(theTerms.toArray(new Term[0])); } finally { theEnum.close(); } } } aQuery.add(thePhraseQuery, Occur.MUST); }
From source file:org.alfresco.repo.search.impl.lucene.AbstractLuceneQueryParser.java
License:Open Source License
@SuppressWarnings("unchecked") protected Query getFieldQueryImpl(String field, String queryText, AnalysisMode analysisMode, LuceneFunction luceneFunction) throws ParseException { // Use the analyzer to get all the tokens, and then build a TermQuery, // PhraseQuery, or noth // TODO: Untokenised columns with functions require special handling if (luceneFunction != LuceneFunction.FIELD) { throw new UnsupportedOperationException( "Field queries are not supported on lucene functions (UPPER, LOWER, etc)"); }// ww w . ja v a2 s.c o m // if the incoming string already has a language identifier we strip it iff and addit back on again String localePrefix = ""; String toTokenise = queryText; if (queryText.startsWith("{")) { int position = queryText.indexOf("}"); String language = queryText.substring(0, position + 1); Locale locale = new Locale(queryText.substring(1, position)); String token = queryText.substring(position + 1); boolean found = false; if (!locale.toString().isEmpty()) { for (Locale current : Locale.getAvailableLocales()) { if (current.toString().equalsIgnoreCase(locale.toString())) { found = true; break; } } } if (found) { localePrefix = language; toTokenise = token; } else { toTokenise = token; } } String testText = toTokenise; boolean requiresMLTokenDuplication = false; String localeString = null; if (field.startsWith(PROPERTY_FIELD_PREFIX) && (localePrefix.length() == 0)) { if ((queryText.length() > 0) && (queryText.charAt(0) == '\u0000')) { int position = queryText.indexOf("\u0000", 1); testText = queryText.substring(position + 1); requiresMLTokenDuplication = true; localeString = queryText.substring(1, position); } } // find the positions of any escaped * and ? and ignore them Set<Integer> wildcardPoistions = getWildcardPositions(testText); TokenStream source; if ((localePrefix.length() == 0) || (wildcardPoistions.size() > 0) || (analysisMode == AnalysisMode.IDENTIFIER)) { source = getAnalyzer().tokenStream(field, new StringReader(toTokenise), analysisMode); } else { source = getAnalyzer().tokenStream(field, new StringReader( "\u0000" + localePrefix.substring(1, localePrefix.length() - 1) + "\u0000" + toTokenise), analysisMode); localePrefix = ""; } ArrayList<org.apache.lucene.analysis.Token> list = new ArrayList<org.apache.lucene.analysis.Token>(); org.apache.lucene.analysis.Token reusableToken = new org.apache.lucene.analysis.Token(); org.apache.lucene.analysis.Token nextToken; int positionCount = 0; boolean severalTokensAtSamePosition = false; while (true) { try { nextToken = source.next(reusableToken); } catch (IOException e) { nextToken = null; } if (nextToken == null) break; list.add((org.apache.lucene.analysis.Token) nextToken.clone()); if (nextToken.getPositionIncrement() != 0) positionCount += nextToken.getPositionIncrement(); else severalTokensAtSamePosition = true; } try { source.close(); } catch (IOException e) { // ignore } // add any alpha numeric wildcards that have been missed // Fixes most stop word and wild card issues for (int index = 0; index < testText.length(); index++) { char current = testText.charAt(index); if (((current == '*') || (current == '?')) && wildcardPoistions.contains(index)) { StringBuilder pre = new StringBuilder(10); if (index == 0) { // "*" and "?" at the start boolean found = false; for (int j = 0; j < list.size(); j++) { org.apache.lucene.analysis.Token test = list.get(j); if ((test.startOffset() <= 0) && (0 < test.endOffset())) { found = true; break; } } if (!found && (testText.length() == 1)) { // Add new token followed by * not given by the tokeniser org.apache.lucene.analysis.Token newToken = new org.apache.lucene.analysis.Token(0, 0); newToken.setTermBuffer(""); newToken.setType("ALPHANUM"); if (requiresMLTokenDuplication) { Locale locale = I18NUtil.parseLocale(localeString); MLAnalysisMode mlAnalysisMode = searchParameters.getMlAnalaysisMode() == null ? defaultSearchMLAnalysisMode : searchParameters.getMlAnalaysisMode(); MLTokenDuplicator duplicator = new MLTokenDuplicator(locale, mlAnalysisMode); Iterator<org.apache.lucene.analysis.Token> it = duplicator.buildIterator(newToken); if (it != null) { int count = 0; while (it.hasNext()) { list.add(it.next()); count++; if (count > 1) { severalTokensAtSamePosition = true; } } } } // content else { list.add(newToken); } } } else if (index > 0) { // Add * and ? back into any tokens from which it has been removed boolean tokenFound = false; for (int j = 0; j < list.size(); j++) { org.apache.lucene.analysis.Token test = list.get(j); if ((test.startOffset() <= index) && (index < test.endOffset())) { if (requiresMLTokenDuplication) { String termText = new String(test.termBuffer(), 0, test.termLength()); int position = termText.indexOf("}"); String language = termText.substring(0, position + 1); String token = termText.substring(position + 1); if (index >= test.startOffset() + token.length()) { test.setTermBuffer(language + token + current); } } else { if (index >= test.startOffset() + test.termLength()) { test.setTermBuffer(test.term() + current); } } tokenFound = true; break; } } if (!tokenFound) { for (int i = index - 1; i >= 0; i--) { char c = testText.charAt(i); if (Character.isLetterOrDigit(c)) { boolean found = false; for (int j = 0; j < list.size(); j++) { org.apache.lucene.analysis.Token test = list.get(j); if ((test.startOffset() <= i) && (i < test.endOffset())) { found = true; break; } } if (found) { break; } else { pre.insert(0, c); } } else { break; } } if (pre.length() > 0) { // Add new token followed by * not given by the tokeniser org.apache.lucene.analysis.Token newToken = new org.apache.lucene.analysis.Token( index - pre.length(), index); newToken.setTermBuffer(pre.toString()); newToken.setType("ALPHANUM"); if (requiresMLTokenDuplication) { Locale locale = I18NUtil.parseLocale(localeString); MLAnalysisMode mlAnalysisMode = searchParameters.getMlAnalaysisMode() == null ? defaultSearchMLAnalysisMode : searchParameters.getMlAnalaysisMode(); MLTokenDuplicator duplicator = new MLTokenDuplicator(locale, mlAnalysisMode); Iterator<org.apache.lucene.analysis.Token> it = duplicator.buildIterator(newToken); if (it != null) { int count = 0; while (it.hasNext()) { list.add(it.next()); count++; if (count > 1) { severalTokensAtSamePosition = true; } } } } // content else { list.add(newToken); } } } } StringBuilder post = new StringBuilder(10); if (index > 0) { for (int i = index + 1; i < testText.length(); i++) { char c = testText.charAt(i); if (Character.isLetterOrDigit(c)) { boolean found = false; for (int j = 0; j < list.size(); j++) { org.apache.lucene.analysis.Token test = list.get(j); if ((test.startOffset() <= i) && (i < test.endOffset())) { found = true; break; } } if (found) { break; } else { post.append(c); } } else { break; } } if (post.length() > 0) { // Add new token followed by * not given by the tokeniser org.apache.lucene.analysis.Token newToken = new org.apache.lucene.analysis.Token(index + 1, index + 1 + post.length()); newToken.setTermBuffer(post.toString()); newToken.setType("ALPHANUM"); if (requiresMLTokenDuplication) { Locale locale = I18NUtil.parseLocale(localeString); MLAnalysisMode mlAnalysisMode = searchParameters.getMlAnalaysisMode() == null ? defaultSearchMLAnalysisMode : searchParameters.getMlAnalaysisMode(); MLTokenDuplicator duplicator = new MLTokenDuplicator(locale, mlAnalysisMode); Iterator<org.apache.lucene.analysis.Token> it = duplicator.buildIterator(newToken); if (it != null) { int count = 0; while (it.hasNext()) { list.add(it.next()); count++; if (count > 1) { severalTokensAtSamePosition = true; } } } } // content else { list.add(newToken); } } } } } Collections.sort(list, new Comparator<org.apache.lucene.analysis.Token>() { public int compare(Token o1, Token o2) { int dif = o1.startOffset() - o2.startOffset(); if (dif != 0) { return dif; } else { return o2.getPositionIncrement() - o1.getPositionIncrement(); } } }); // Combined * and ? based strings - should redo the tokeniser // Build tokens by position LinkedList<LinkedList<org.apache.lucene.analysis.Token>> tokensByPosition = new LinkedList<LinkedList<org.apache.lucene.analysis.Token>>(); LinkedList<org.apache.lucene.analysis.Token> currentList = null; for (org.apache.lucene.analysis.Token c : list) { if (c.getPositionIncrement() == 0) { if (currentList == null) { currentList = new LinkedList<org.apache.lucene.analysis.Token>(); tokensByPosition.add(currentList); } currentList.add(c); } else { currentList = new LinkedList<org.apache.lucene.analysis.Token>(); tokensByPosition.add(currentList); currentList.add(c); } } // Build all the token sequences and see which ones get strung together LinkedList<LinkedList<org.apache.lucene.analysis.Token>> allTokenSequences = new LinkedList<LinkedList<org.apache.lucene.analysis.Token>>(); for (LinkedList<org.apache.lucene.analysis.Token> tokensAtPosition : tokensByPosition) { if (allTokenSequences.size() == 0) { for (org.apache.lucene.analysis.Token t : tokensAtPosition) { LinkedList<org.apache.lucene.analysis.Token> newEntry = new LinkedList<org.apache.lucene.analysis.Token>(); newEntry.add(t); allTokenSequences.add(newEntry); } } else { LinkedList<LinkedList<org.apache.lucene.analysis.Token>> newAllTokeSequences = new LinkedList<LinkedList<org.apache.lucene.analysis.Token>>(); FOR_FIRST_TOKEN_AT_POSITION_ONLY: for (org.apache.lucene.analysis.Token t : tokensAtPosition) { boolean tokenFoundSequence = false; for (LinkedList<org.apache.lucene.analysis.Token> tokenSequence : allTokenSequences) { LinkedList<org.apache.lucene.analysis.Token> newEntry = new LinkedList<org.apache.lucene.analysis.Token>(); newEntry.addAll(tokenSequence); if (newEntry.getLast().endOffset() <= t.startOffset()) { newEntry.add(t); tokenFoundSequence = true; } newAllTokeSequences.add(newEntry); } if (false == tokenFoundSequence) { LinkedList<org.apache.lucene.analysis.Token> newEntry = new LinkedList<org.apache.lucene.analysis.Token>(); newEntry.add(t); newAllTokeSequences.add(newEntry); } // Limit the max number of permutations we consider if (newAllTokeSequences.size() > 64) { break FOR_FIRST_TOKEN_AT_POSITION_ONLY; } } allTokenSequences = newAllTokeSequences; } } // build the uniquie LinkedList<LinkedList<org.apache.lucene.analysis.Token>> fixedTokenSequences = new LinkedList<LinkedList<org.apache.lucene.analysis.Token>>(); for (LinkedList<org.apache.lucene.analysis.Token> tokenSequence : allTokenSequences) { LinkedList<org.apache.lucene.analysis.Token> fixedTokenSequence = new LinkedList<org.apache.lucene.analysis.Token>(); fixedTokenSequences.add(fixedTokenSequence); org.apache.lucene.analysis.Token replace = null; for (org.apache.lucene.analysis.Token c : tokenSequence) { if (replace == null) { StringBuilder prefix = new StringBuilder(); for (int i = c.startOffset() - 1; i >= 0; i--) { char test = testText.charAt(i); if (((test == '*') || (test == '?')) && wildcardPoistions.contains(i)) { prefix.insert(0, test); } else { break; } } String pre = prefix.toString(); if (requiresMLTokenDuplication) { String termText = new String(c.termBuffer(), 0, c.termLength()); int position = termText.indexOf("}"); String language = termText.substring(0, position + 1); String token = termText.substring(position + 1); replace = new org.apache.lucene.analysis.Token(c.startOffset() - pre.length(), c.endOffset()); replace.setTermBuffer(language + pre + token); replace.setType(c.type()); replace.setPositionIncrement(c.getPositionIncrement()); } else { String termText = new String(c.termBuffer(), 0, c.termLength()); replace = new org.apache.lucene.analysis.Token(c.startOffset() - pre.length(), c.endOffset()); replace.setTermBuffer(pre + termText); replace.setType(c.type()); replace.setPositionIncrement(c.getPositionIncrement()); } } else { StringBuilder prefix = new StringBuilder(); StringBuilder postfix = new StringBuilder(); StringBuilder builder = prefix; for (int i = c.startOffset() - 1; i >= replace.endOffset(); i--) { char test = testText.charAt(i); if (((test == '*') || (test == '?')) && wildcardPoistions.contains(i)) { builder.insert(0, test); } else { builder = postfix; postfix.setLength(0); } } String pre = prefix.toString(); String post = postfix.toString(); // Does it bridge? if ((pre.length() > 0) && (replace.endOffset() + pre.length()) == c.startOffset()) { String termText = new String(c.termBuffer(), 0, c.termLength()); if (requiresMLTokenDuplication) { int position = termText.indexOf("}"); @SuppressWarnings("unused") String language = termText.substring(0, position + 1); String token = termText.substring(position + 1); int oldPositionIncrement = replace.getPositionIncrement(); String replaceTermText = new String(replace.termBuffer(), 0, replace.termLength()); replace = new org.apache.lucene.analysis.Token(replace.startOffset(), c.endOffset()); replace.setTermBuffer(replaceTermText + pre + token); replace.setType(replace.type()); replace.setPositionIncrement(oldPositionIncrement); } else { int oldPositionIncrement = replace.getPositionIncrement(); String replaceTermText = new String(replace.termBuffer(), 0, replace.termLength()); replace = new org.apache.lucene.analysis.Token(replace.startOffset(), c.endOffset()); replace.setTermBuffer(replaceTermText + pre + termText); replace.setType(replace.type()); replace.setPositionIncrement(oldPositionIncrement); } } else { String termText = new String(c.termBuffer(), 0, c.termLength()); if (requiresMLTokenDuplication) { int position = termText.indexOf("}"); String language = termText.substring(0, position + 1); String token = termText.substring(position + 1); String replaceTermText = new String(replace.termBuffer(), 0, replace.termLength()); org.apache.lucene.analysis.Token last = new org.apache.lucene.analysis.Token( replace.startOffset(), replace.endOffset() + post.length()); last.setTermBuffer(replaceTermText + post); last.setType(replace.type()); last.setPositionIncrement(replace.getPositionIncrement()); fixedTokenSequence.add(last); replace = new org.apache.lucene.analysis.Token(c.startOffset() - pre.length(), c.endOffset()); replace.setTermBuffer(language + pre + token); replace.setType(c.type()); replace.setPositionIncrement(c.getPositionIncrement()); } else { String replaceTermText = new String(replace.termBuffer(), 0, replace.termLength()); org.apache.lucene.analysis.Token last = new org.apache.lucene.analysis.Token( replace.startOffset(), replace.endOffset() + post.length()); last.setTermBuffer(replaceTermText + post); last.setType(replace.type()); last.setPositionIncrement(replace.getPositionIncrement()); fixedTokenSequence.add(last); replace = new org.apache.lucene.analysis.Token(c.startOffset() - pre.length(), c.endOffset()); replace.setTermBuffer(pre + termText); replace.setType(c.type()); replace.setPositionIncrement(c.getPositionIncrement()); } } } } // finish last if (replace != null) { StringBuilder postfix = new StringBuilder(); if ((replace.endOffset() >= 0) && (replace.endOffset() < testText.length())) { for (int i = replace.endOffset(); i < testText.length(); i++) { char test = testText.charAt(i); if (((test == '*') || (test == '?')) && wildcardPoistions.contains(i)) { postfix.append(test); } else { break; } } } String post = postfix.toString(); int oldPositionIncrement = replace.getPositionIncrement(); String replaceTermText = new String(replace.termBuffer(), 0, replace.termLength()); replace = new org.apache.lucene.analysis.Token(replace.startOffset(), replace.endOffset() + post.length()); replace.setTermBuffer(replaceTermText + post); replace.setType(replace.type()); replace.setPositionIncrement(oldPositionIncrement); fixedTokenSequence.add(replace); } } // rebuild fixed list ArrayList<org.apache.lucene.analysis.Token> fixed = new ArrayList<org.apache.lucene.analysis.Token>(); for (LinkedList<org.apache.lucene.analysis.Token> tokenSequence : fixedTokenSequences) { for (org.apache.lucene.analysis.Token token : tokenSequence) { fixed.add(token); } } // reorder by start position and increment Collections.sort(fixed, new Comparator<org.apache.lucene.analysis.Token>() { public int compare(Token o1, Token o2) { int dif = o1.startOffset() - o2.startOffset(); if (dif != 0) { return dif; } else { return o1.getPositionIncrement() - o2.getPositionIncrement(); } } }); // make sure we remove any tokens we have duplicated @SuppressWarnings("rawtypes") OrderedHashSet unique = new OrderedHashSet(); unique.addAll(fixed); fixed = new ArrayList<org.apache.lucene.analysis.Token>(unique); list = fixed; // add any missing locales back to the tokens if (localePrefix.length() > 0) { for (int j = 0; j < list.size(); j++) { org.apache.lucene.analysis.Token currentToken = list.get(j); String termText = new String(currentToken.termBuffer(), 0, currentToken.termLength()); currentToken.setTermBuffer(localePrefix + termText); } } if (list.size() == 0) return null; else if (list.size() == 1) { nextToken = list.get(0); String termText = new String(nextToken.termBuffer(), 0, nextToken.termLength()); if (termText.contains("*") || termText.contains("?")) { return newWildcardQuery( new Term(field, getLowercaseExpandedTerms() ? termText.toLowerCase() : termText)); } else { return newTermQuery(new Term(field, termText)); } } else { if (severalTokensAtSamePosition) { if (positionCount == 1) { // no phrase query: BooleanQuery q = newBooleanQuery(true); for (int i = 0; i < list.size(); i++) { Query currentQuery; nextToken = list.get(i); String termText = new String(nextToken.termBuffer(), 0, nextToken.termLength()); if (termText.contains("*") || termText.contains("?")) { currentQuery = newWildcardQuery(new Term(field, getLowercaseExpandedTerms() ? termText.toLowerCase() : termText)); } else { currentQuery = newTermQuery(new Term(field, termText)); } q.add(currentQuery, BooleanClause.Occur.SHOULD); } return q; } // Consider if we can use a multi-phrase query (e.g for synonym use rather then WordDelimiterFilterFactory) else if (canUseMultiPhraseQuery(fixedTokenSequences)) { // phrase query: MultiPhraseQuery mpq = newMultiPhraseQuery(); mpq.setSlop(internalSlop); ArrayList<Term> multiTerms = new ArrayList<Term>(); int position = 0; for (int i = 0; i < list.size(); i++) { nextToken = list.get(i); String termText = new String(nextToken.termBuffer(), 0, nextToken.termLength()); Term term = new Term(field, termText); if ((termText != null) && (termText.contains("*") || termText.contains("?"))) { addWildcardTerms(multiTerms, term); } else { multiTerms.add(term); } if (nextToken.getPositionIncrement() > 0 && multiTerms.size() > 0) { if (getEnablePositionIncrements()) { mpq.add(multiTerms.toArray(new Term[0]), position); } else { mpq.add(multiTerms.toArray(new Term[0])); } checkTermCount(field, queryText, mpq); multiTerms.clear(); } position += nextToken.getPositionIncrement(); } if (getEnablePositionIncrements()) { if (multiTerms.size() > 0) { mpq.add(multiTerms.toArray(new Term[0]), position); } // else // { // mpq.add(new Term[] { new Term(field, "\u0000") }, position); // } } else { if (multiTerms.size() > 0) { mpq.add(multiTerms.toArray(new Term[0])); } // else // { // mpq.add(new Term[] { new Term(field, "\u0000") }); // } } checkTermCount(field, queryText, mpq); return mpq; } // Word delimiter factory and other odd things generate complex token patterns // Smart skip token sequences with small tokens that generate toomany wildcards // Fall back to the larger pattern // e.g Site1* will not do (S ite 1*) or (Site 1*) if 1* matches too much (S ite1*) and (Site1*) will still be OK // If we skip all (for just 1* in the input) this is still an issue. else { boolean skippedTokens = false; BooleanQuery q = newBooleanQuery(true); TOKEN_SEQUENCE: for (LinkedList<org.apache.lucene.analysis.Token> tokenSequence : fixedTokenSequences) { // phrase query: MultiPhraseQuery mpq = newMultiPhraseQuery(); mpq.setSlop(internalSlop); int position = 0; for (int i = 0; i < tokenSequence.size(); i++) { nextToken = (org.apache.lucene.analysis.Token) tokenSequence.get(i); String termText = new String(nextToken.termBuffer(), 0, nextToken.termLength()); Term term = new Term(field, termText); if (getEnablePositionIncrements()) { if ((termText != null) && (termText.contains("*") || termText.contains("?"))) { mpq.add(getMatchingTerms(field, term), position); } else { mpq.add(new Term[] { term }, position); } if (exceedsTermCount(mpq)) { // We could duplicate the token sequence without the failing wildcard expansion and try again ?? skippedTokens = true; continue TOKEN_SEQUENCE; } if (nextToken.getPositionIncrement() > 0) { position += nextToken.getPositionIncrement(); } else { position++; } } else { if ((termText != null) && (termText.contains("*") || termText.contains("?"))) { mpq.add(getMatchingTerms(field, term)); } else { mpq.add(term); } if (exceedsTermCount(mpq)) { skippedTokens = true; continue TOKEN_SEQUENCE; } } } q.add(mpq, BooleanClause.Occur.SHOULD); } if (skippedTokens && (q.clauses().size() == 0)) { throw new LuceneQueryParserException( "Query skipped all token sequences as wildcards generated too many clauses: " + field + " " + queryText); } return q; } } else { MultiPhraseQuery q = new MultiPhraseQuery(); q.setSlop(internalSlop); int position = 0; for (int i = 0; i < list.size(); i++) { nextToken = list.get(i); String termText = new String(nextToken.termBuffer(), 0, nextToken.termLength()); Term term = new Term(field, termText); if (getEnablePositionIncrements()) { if ((termText != null) && (termText.contains("*") || termText.contains("?"))) { q.add(getMatchingTerms(field, term), position); } else { q.add(new Term[] { term }, position); } checkTermCount(field, queryText, q); if (nextToken.getPositionIncrement() > 0) { position += nextToken.getPositionIncrement(); } else { position++; } } else { if ((termText != null) && (termText.contains("*") || termText.contains("?"))) { q.add(getMatchingTerms(field, term)); } else { q.add(term); } checkTermCount(field, queryText, q); } } return q; } } }
From source file:org.apache.blur.lucene.search.BlurQueryParser.java
License:Apache License
@Override protected MultiPhraseQuery newMultiPhraseQuery() { return new MultiPhraseQuery() { @Override/*from w ww.j av a 2 s. c o m*/ public void add(Term[] terms, int position) { super.add(terms, position); for (Term term : terms) { String resolvedField = _fieldManager.resolveField(term.field()); customQueryCheck(resolvedField); addField(this, resolvedField); } } }; }