Example usage for org.apache.lucene.search MultiPhraseQuery MultiPhraseQuery

List of usage examples for org.apache.lucene.search MultiPhraseQuery MultiPhraseQuery

Introduction

In this page you can find the example usage for org.apache.lucene.search MultiPhraseQuery MultiPhraseQuery.

Prototype

MultiPhraseQuery

Source Link

Usage

From source file:aos.lucene.search.advanced.MultiPhraseQueryTest.java

License:Apache License

public void testBasic() throws Exception {
    MultiPhraseQuery query = new MultiPhraseQuery();
    query.add(new Term[] { // #A
            new Term("field", "quick"), // #A
            new Term("field", "fast") // #A
    });/*from ww  w.ja  v a 2 s  .  c om*/
    query.add(new Term("field", "fox")); // #B
    LOGGER.info(query);

    TopDocs hits = searcher.search(query, 10);
    assertEquals("fast fox match", 1, hits.totalHits);

    query.setSlop(1);
    hits = searcher.search(query, 10);
    assertEquals("both match", 2, hits.totalHits);
}

From source file:com.bewsia.script.safe.lucene.SEntity.java

License:Open Source License

public MultiPhraseQuery newMultiPhraseQuery() {
    return new MultiPhraseQuery();
}

From source file:com.greplin.lucene.query.PhrasePrefixQuery.java

License:Apache License

@Override
public final Query rewrite(final IndexReader reader) throws IOException {
    Term[] prefixTerms = getPrefixTerms(terms.get(terms.size() - 1), reader);
    if (prefixTerms == null) {
        return new MatchNoDocsQuery();
    }//from   w ww  .  j a  va 2s  . c  o  m

    MultiPhraseQuery query = new MultiPhraseQuery();
    for (int i = 0; i < terms.size() - 1; i++) {
        query.add(new Term(field, terms.get(i)));
    }
    query.add(prefixTerms);
    return query;
}

From source file:com.leavesfly.lia.advsearching.MultiPhraseQueryTest.java

License:Apache License

public void testBasic() throws Exception {
    MultiPhraseQuery query = new MultiPhraseQuery();
    query.add(new Term[] { // #A
            new Term("field", "quick"), // #A
            new Term("field", "fast") // #A
    });/*  w w w.  jav a 2 s.co m*/
    query.add(new Term("field", "fox")); // #B
    System.out.println(query);

    TopDocs hits = searcher.search(query, 10);
    assertEquals("fast fox match", 1, hits.totalHits);

    query.setSlop(1);
    hits = searcher.search(query, 10);
    assertEquals("both match", 2, hits.totalHits);
}

From source file:com.nearinfinity.blur.lucene.search.SuperParser.java

License:Apache License

@Override
protected MultiPhraseQuery newMultiPhraseQuery() {
    return new MultiPhraseQuery() {
        private static final long serialVersionUID = 2743009696906520410L;

        @Override//from w  w w  .j  a va 2 s  . c o  m
        public void add(Term[] terms, int position) {
            super.add(terms, position);
            for (Term term : terms) {
                addField(this, term.field());
            }
        }
    };
}

From source file:com.zimbra.cs.index.AbstractIndexStoreTest.java

License:Open Source License

@Test
public void multiPhraseQuery() throws Exception {
    ZimbraLog.test.debug("--->TEST multiPhraseQuery");
    Mailbox mbox = MailboxManager.getInstance().getMailboxByAccountId(MockProvisioning.DEFAULT_ACCOUNT_ID);
    createContact(mbox, "Non", "Match", "nOn.MaTchiNg@zimbra.com");
    Contact contact1 = createContact(mbox, "Paul", "AA", "aa@example.net", "Software Development Engineer");
    createContact(mbox, "Jane", "BB", "bb@example.net", "Software Planning Engineer");
    Contact contact2 = createContact(mbox, "Peter", "CC", "cc@example.net", "Software Dev Engineer");
    createContact(mbox, "Avril", "DD", "dd@example.net", "Software Architectural Engineer");
    Contact contact3 = createContact(mbox, "Leo", "EE", "ee@example.net", "Software Developer Engineer");
    Contact contact4 = createContact(mbox, "Wow", "DD", "dd@example.net", "Softly Development Engineer");
    createContact(mbox, "Given", "Surname", "GiV.SurN@zimbra.com");
    mbox.index.indexDeferredItems(); // Make sure all indexing has been done

    IndexStore index = mbox.index.getIndexStore();
    ZimbraIndexSearcher searcher = index.openSearcher();
    MultiPhraseQuery pquery = new MultiPhraseQuery();
    // Lower case required for each term for Lucene
    Term[] firstWords = { new Term(LuceneFields.L_CONTENT, "softly"),
            new Term(LuceneFields.L_CONTENT, "software") };
    pquery.add(firstWords);/*from w  w  w. j a v a 2  s.c  o m*/
    Term[] secondWords = { new Term(LuceneFields.L_CONTENT, "dev"),
            new Term(LuceneFields.L_CONTENT, "development"), new Term(LuceneFields.L_CONTENT, "developer") };
    pquery.add(secondWords);
    pquery.add(new Term(LuceneFields.L_CONTENT, "engineer"));
    ZimbraTopDocs result = searcher.search(pquery, 100);
    Assert.assertNotNull("searcher.search result object", result);
    ZimbraLog.test.debug("Result for search [hits=%d]:%s", result.getTotalHits(), result.toString());
    Assert.assertEquals("Number of hits", 4, result.getTotalHits());
    List<String> expecteds = Lists.newArrayList();
    List<String> matches = Lists.newArrayList();
    matches.add(getBlobIdForResultDoc(searcher, result, 0));
    matches.add(getBlobIdForResultDoc(searcher, result, 1));
    matches.add(getBlobIdForResultDoc(searcher, result, 2));
    matches.add(getBlobIdForResultDoc(searcher, result, 3));
    expecteds.add(String.valueOf(contact1.getId()));
    expecteds.add(String.valueOf(contact2.getId()));
    expecteds.add(String.valueOf(contact3.getId()));
    expecteds.add(String.valueOf(contact4.getId()));
    Collections.sort(matches);
    Collections.sort(expecteds);
    for (int ndx = 0; ndx < 4; ndx++) {
        Assert.assertEquals("Match Blob ID", expecteds.get(0), matches.get(0));
    }
}

From source file:com.zimbra.cs.index.LuceneQueryOperation.java

License:Open Source License

private Query expandLazyMultiPhraseQuery(Query query) throws IOException {
    if (query instanceof LazyMultiPhraseQuery) {
        LazyMultiPhraseQuery lazy = (LazyMultiPhraseQuery) query;
        int max = LC.zimbra_index_wildcard_max_terms_expanded.intValue();
        MultiPhraseQuery mquery = new MultiPhraseQuery();
        for (Term[] terms : lazy.getTermArrays()) {
            if (terms.length != 1) {
                mquery.add(terms);//from   w  ww .  java  2s .  c  om
                continue;
            }
            Term base = terms[0];
            if (!lazy.expand.contains(base)) {
                mquery.add(terms);
                continue;
            }
            List<Term> expanded = Lists.newArrayList();
            TermFieldEnumeration itr = searcher.getIndexReader().getTermsForField(base.field(), base.text());
            try {
                while (itr.hasMoreElements()) {
                    BrowseTerm term = itr.nextElement();
                    if (term != null && term.getText().startsWith(base.text())) {
                        if (expanded.size() >= max) { // too many terms expanded
                            break;
                        }
                        expanded.add(new Term(base.field(), term.getText()));
                    } else {
                        break;
                    }
                }
            } finally {
                Closeables.closeQuietly(itr);
            }
            if (expanded.isEmpty()) {
                return null;
            } else {
                mquery.add(expanded.toArray(new Term[expanded.size()]));
            }
        }
        return mquery;
    } else if (query instanceof BooleanQuery) {
        ListIterator<BooleanClause> itr = ((BooleanQuery) query).clauses().listIterator();
        while (itr.hasNext()) {
            BooleanClause clause = itr.next();
            Query result = expandLazyMultiPhraseQuery(clause.getQuery());
            if (result == null) {
                if (clause.isRequired()) {
                    return null;
                } else {
                    itr.remove();
                }
            } else if (result != clause.getQuery()) {
                clause.setQuery(result);
            }
        }
        return ((BooleanQuery) query).clauses().isEmpty() ? null : query;
    } else {
        return query;
    }
}

From source file:de.powerstaff.business.service.impl.GoogleStyleQueryParser.java

License:Open Source License

protected void addPhraseQuery(String aTerm, BooleanQuery aQuery, String aField, Analyzer aAnalyzer)
        throws IOException {

    MultiPhraseQuery thePhraseQuery = new MultiPhraseQuery();

    TokenStream theTokenStream = aAnalyzer.tokenStream(aField, new StringReader(aTerm));
    while (theTokenStream.incrementToken()) {

        TermAttribute theTermAttribute = theTokenStream.getAttribute(TermAttribute.class);

        String theTokenText = theTermAttribute.term();

        Term theTerm = new Term(aField, theTokenText);

        if (!isWildcardTerm(theTokenText)) {
            thePhraseQuery.add(theTerm);
        } else {/*from  www .j av  a2  s  .  c  om*/
            Term theWildcardTerm = new Term(theTerm.field(), getCorrectedWildcardTerm(theTerm.text()));
            WildcardTermEnum theEnum = new WildcardTermEnum(reader, theWildcardTerm);
            try {
                List<Term> theTerms = new ArrayList<Term>();
                do {
                    theTerms.add(theEnum.term());
                } while (theEnum.next());
                thePhraseQuery.add(theTerms.toArray(new Term[0]));
            } finally {
                theEnum.close();
            }
        }
    }

    aQuery.add(thePhraseQuery, Occur.MUST);
}

From source file:org.alfresco.repo.search.impl.lucene.AbstractLuceneQueryParser.java

License:Open Source License

@SuppressWarnings("unchecked")
protected Query getFieldQueryImpl(String field, String queryText, AnalysisMode analysisMode,
        LuceneFunction luceneFunction) throws ParseException {
    // Use the analyzer to get all the tokens, and then build a TermQuery,
    // PhraseQuery, or noth

    // TODO: Untokenised columns with functions require special handling

    if (luceneFunction != LuceneFunction.FIELD) {
        throw new UnsupportedOperationException(
                "Field queries are not supported on lucene functions (UPPER, LOWER, etc)");
    }//  ww w .  ja v  a2 s.c o  m

    // if the incoming string already has a language identifier we strip it iff and addit back on again

    String localePrefix = "";

    String toTokenise = queryText;

    if (queryText.startsWith("{")) {
        int position = queryText.indexOf("}");
        String language = queryText.substring(0, position + 1);
        Locale locale = new Locale(queryText.substring(1, position));
        String token = queryText.substring(position + 1);
        boolean found = false;
        if (!locale.toString().isEmpty()) {
            for (Locale current : Locale.getAvailableLocales()) {
                if (current.toString().equalsIgnoreCase(locale.toString())) {
                    found = true;
                    break;
                }
            }
        }
        if (found) {
            localePrefix = language;
            toTokenise = token;
        } else {
            toTokenise = token;
        }
    }

    String testText = toTokenise;
    boolean requiresMLTokenDuplication = false;
    String localeString = null;
    if (field.startsWith(PROPERTY_FIELD_PREFIX) && (localePrefix.length() == 0)) {
        if ((queryText.length() > 0) && (queryText.charAt(0) == '\u0000')) {
            int position = queryText.indexOf("\u0000", 1);
            testText = queryText.substring(position + 1);
            requiresMLTokenDuplication = true;
            localeString = queryText.substring(1, position);
        }
    }

    // find the positions of any escaped * and ? and ignore them

    Set<Integer> wildcardPoistions = getWildcardPositions(testText);

    TokenStream source;
    if ((localePrefix.length() == 0) || (wildcardPoistions.size() > 0)
            || (analysisMode == AnalysisMode.IDENTIFIER)) {
        source = getAnalyzer().tokenStream(field, new StringReader(toTokenise), analysisMode);
    } else {
        source = getAnalyzer().tokenStream(field, new StringReader(
                "\u0000" + localePrefix.substring(1, localePrefix.length() - 1) + "\u0000" + toTokenise),
                analysisMode);
        localePrefix = "";
    }

    ArrayList<org.apache.lucene.analysis.Token> list = new ArrayList<org.apache.lucene.analysis.Token>();
    org.apache.lucene.analysis.Token reusableToken = new org.apache.lucene.analysis.Token();
    org.apache.lucene.analysis.Token nextToken;
    int positionCount = 0;
    boolean severalTokensAtSamePosition = false;

    while (true) {
        try {
            nextToken = source.next(reusableToken);
        } catch (IOException e) {
            nextToken = null;
        }
        if (nextToken == null)
            break;
        list.add((org.apache.lucene.analysis.Token) nextToken.clone());
        if (nextToken.getPositionIncrement() != 0)
            positionCount += nextToken.getPositionIncrement();
        else
            severalTokensAtSamePosition = true;
    }
    try {
        source.close();
    } catch (IOException e) {
        // ignore
    }

    // add any alpha numeric wildcards that have been missed
    // Fixes most stop word and wild card issues

    for (int index = 0; index < testText.length(); index++) {
        char current = testText.charAt(index);
        if (((current == '*') || (current == '?')) && wildcardPoistions.contains(index)) {
            StringBuilder pre = new StringBuilder(10);
            if (index == 0) {
                // "*" and "?" at the start

                boolean found = false;
                for (int j = 0; j < list.size(); j++) {
                    org.apache.lucene.analysis.Token test = list.get(j);
                    if ((test.startOffset() <= 0) && (0 < test.endOffset())) {
                        found = true;
                        break;
                    }
                }
                if (!found && (testText.length() == 1)) {
                    // Add new token followed by * not given by the tokeniser
                    org.apache.lucene.analysis.Token newToken = new org.apache.lucene.analysis.Token(0, 0);
                    newToken.setTermBuffer("");
                    newToken.setType("ALPHANUM");
                    if (requiresMLTokenDuplication) {
                        Locale locale = I18NUtil.parseLocale(localeString);
                        MLAnalysisMode mlAnalysisMode = searchParameters.getMlAnalaysisMode() == null
                                ? defaultSearchMLAnalysisMode
                                : searchParameters.getMlAnalaysisMode();
                        MLTokenDuplicator duplicator = new MLTokenDuplicator(locale, mlAnalysisMode);
                        Iterator<org.apache.lucene.analysis.Token> it = duplicator.buildIterator(newToken);
                        if (it != null) {
                            int count = 0;
                            while (it.hasNext()) {
                                list.add(it.next());
                                count++;
                                if (count > 1) {
                                    severalTokensAtSamePosition = true;
                                }
                            }
                        }
                    }
                    // content
                    else {
                        list.add(newToken);
                    }
                }
            } else if (index > 0) {
                // Add * and ? back into any tokens from which it has been removed

                boolean tokenFound = false;
                for (int j = 0; j < list.size(); j++) {
                    org.apache.lucene.analysis.Token test = list.get(j);
                    if ((test.startOffset() <= index) && (index < test.endOffset())) {
                        if (requiresMLTokenDuplication) {
                            String termText = new String(test.termBuffer(), 0, test.termLength());
                            int position = termText.indexOf("}");
                            String language = termText.substring(0, position + 1);
                            String token = termText.substring(position + 1);
                            if (index >= test.startOffset() + token.length()) {
                                test.setTermBuffer(language + token + current);
                            }
                        } else {
                            if (index >= test.startOffset() + test.termLength()) {
                                test.setTermBuffer(test.term() + current);
                            }
                        }
                        tokenFound = true;
                        break;
                    }
                }

                if (!tokenFound) {
                    for (int i = index - 1; i >= 0; i--) {
                        char c = testText.charAt(i);
                        if (Character.isLetterOrDigit(c)) {
                            boolean found = false;
                            for (int j = 0; j < list.size(); j++) {
                                org.apache.lucene.analysis.Token test = list.get(j);
                                if ((test.startOffset() <= i) && (i < test.endOffset())) {
                                    found = true;
                                    break;
                                }
                            }
                            if (found) {
                                break;
                            } else {
                                pre.insert(0, c);
                            }
                        } else {
                            break;
                        }
                    }
                    if (pre.length() > 0) {
                        // Add new token followed by * not given by the tokeniser
                        org.apache.lucene.analysis.Token newToken = new org.apache.lucene.analysis.Token(
                                index - pre.length(), index);
                        newToken.setTermBuffer(pre.toString());
                        newToken.setType("ALPHANUM");
                        if (requiresMLTokenDuplication) {
                            Locale locale = I18NUtil.parseLocale(localeString);
                            MLAnalysisMode mlAnalysisMode = searchParameters.getMlAnalaysisMode() == null
                                    ? defaultSearchMLAnalysisMode
                                    : searchParameters.getMlAnalaysisMode();
                            MLTokenDuplicator duplicator = new MLTokenDuplicator(locale, mlAnalysisMode);
                            Iterator<org.apache.lucene.analysis.Token> it = duplicator.buildIterator(newToken);
                            if (it != null) {
                                int count = 0;
                                while (it.hasNext()) {
                                    list.add(it.next());
                                    count++;
                                    if (count > 1) {
                                        severalTokensAtSamePosition = true;
                                    }
                                }
                            }
                        }
                        // content
                        else {
                            list.add(newToken);
                        }
                    }
                }
            }

            StringBuilder post = new StringBuilder(10);
            if (index > 0) {
                for (int i = index + 1; i < testText.length(); i++) {
                    char c = testText.charAt(i);
                    if (Character.isLetterOrDigit(c)) {
                        boolean found = false;
                        for (int j = 0; j < list.size(); j++) {
                            org.apache.lucene.analysis.Token test = list.get(j);
                            if ((test.startOffset() <= i) && (i < test.endOffset())) {
                                found = true;
                                break;
                            }
                        }
                        if (found) {
                            break;
                        } else {
                            post.append(c);
                        }
                    } else {
                        break;
                    }
                }
                if (post.length() > 0) {
                    // Add new token followed by * not given by the tokeniser
                    org.apache.lucene.analysis.Token newToken = new org.apache.lucene.analysis.Token(index + 1,
                            index + 1 + post.length());
                    newToken.setTermBuffer(post.toString());
                    newToken.setType("ALPHANUM");
                    if (requiresMLTokenDuplication) {
                        Locale locale = I18NUtil.parseLocale(localeString);
                        MLAnalysisMode mlAnalysisMode = searchParameters.getMlAnalaysisMode() == null
                                ? defaultSearchMLAnalysisMode
                                : searchParameters.getMlAnalaysisMode();
                        MLTokenDuplicator duplicator = new MLTokenDuplicator(locale, mlAnalysisMode);
                        Iterator<org.apache.lucene.analysis.Token> it = duplicator.buildIterator(newToken);
                        if (it != null) {
                            int count = 0;
                            while (it.hasNext()) {
                                list.add(it.next());
                                count++;
                                if (count > 1) {
                                    severalTokensAtSamePosition = true;
                                }
                            }
                        }
                    }
                    // content
                    else {
                        list.add(newToken);
                    }
                }
            }

        }
    }

    Collections.sort(list, new Comparator<org.apache.lucene.analysis.Token>() {

        public int compare(Token o1, Token o2) {
            int dif = o1.startOffset() - o2.startOffset();
            if (dif != 0) {
                return dif;
            } else {
                return o2.getPositionIncrement() - o1.getPositionIncrement();
            }
        }
    });

    // Combined * and ? based strings - should redo the tokeniser

    // Build tokens by position

    LinkedList<LinkedList<org.apache.lucene.analysis.Token>> tokensByPosition = new LinkedList<LinkedList<org.apache.lucene.analysis.Token>>();
    LinkedList<org.apache.lucene.analysis.Token> currentList = null;
    for (org.apache.lucene.analysis.Token c : list) {
        if (c.getPositionIncrement() == 0) {
            if (currentList == null) {
                currentList = new LinkedList<org.apache.lucene.analysis.Token>();
                tokensByPosition.add(currentList);
            }
            currentList.add(c);
        } else {
            currentList = new LinkedList<org.apache.lucene.analysis.Token>();
            tokensByPosition.add(currentList);
            currentList.add(c);
        }
    }

    // Build all the token sequences and see which ones get strung together

    LinkedList<LinkedList<org.apache.lucene.analysis.Token>> allTokenSequences = new LinkedList<LinkedList<org.apache.lucene.analysis.Token>>();
    for (LinkedList<org.apache.lucene.analysis.Token> tokensAtPosition : tokensByPosition) {
        if (allTokenSequences.size() == 0) {
            for (org.apache.lucene.analysis.Token t : tokensAtPosition) {
                LinkedList<org.apache.lucene.analysis.Token> newEntry = new LinkedList<org.apache.lucene.analysis.Token>();
                newEntry.add(t);
                allTokenSequences.add(newEntry);
            }
        } else {
            LinkedList<LinkedList<org.apache.lucene.analysis.Token>> newAllTokeSequences = new LinkedList<LinkedList<org.apache.lucene.analysis.Token>>();

            FOR_FIRST_TOKEN_AT_POSITION_ONLY: for (org.apache.lucene.analysis.Token t : tokensAtPosition) {
                boolean tokenFoundSequence = false;
                for (LinkedList<org.apache.lucene.analysis.Token> tokenSequence : allTokenSequences) {
                    LinkedList<org.apache.lucene.analysis.Token> newEntry = new LinkedList<org.apache.lucene.analysis.Token>();
                    newEntry.addAll(tokenSequence);
                    if (newEntry.getLast().endOffset() <= t.startOffset()) {
                        newEntry.add(t);
                        tokenFoundSequence = true;
                    }
                    newAllTokeSequences.add(newEntry);
                }
                if (false == tokenFoundSequence) {
                    LinkedList<org.apache.lucene.analysis.Token> newEntry = new LinkedList<org.apache.lucene.analysis.Token>();
                    newEntry.add(t);
                    newAllTokeSequences.add(newEntry);
                }
                // Limit the max number of permutations we consider
                if (newAllTokeSequences.size() > 64) {
                    break FOR_FIRST_TOKEN_AT_POSITION_ONLY;
                }
            }
            allTokenSequences = newAllTokeSequences;
        }
    }

    // build the uniquie

    LinkedList<LinkedList<org.apache.lucene.analysis.Token>> fixedTokenSequences = new LinkedList<LinkedList<org.apache.lucene.analysis.Token>>();
    for (LinkedList<org.apache.lucene.analysis.Token> tokenSequence : allTokenSequences) {
        LinkedList<org.apache.lucene.analysis.Token> fixedTokenSequence = new LinkedList<org.apache.lucene.analysis.Token>();
        fixedTokenSequences.add(fixedTokenSequence);
        org.apache.lucene.analysis.Token replace = null;
        for (org.apache.lucene.analysis.Token c : tokenSequence) {
            if (replace == null) {
                StringBuilder prefix = new StringBuilder();
                for (int i = c.startOffset() - 1; i >= 0; i--) {
                    char test = testText.charAt(i);
                    if (((test == '*') || (test == '?')) && wildcardPoistions.contains(i)) {
                        prefix.insert(0, test);
                    } else {
                        break;
                    }
                }
                String pre = prefix.toString();
                if (requiresMLTokenDuplication) {
                    String termText = new String(c.termBuffer(), 0, c.termLength());
                    int position = termText.indexOf("}");
                    String language = termText.substring(0, position + 1);
                    String token = termText.substring(position + 1);
                    replace = new org.apache.lucene.analysis.Token(c.startOffset() - pre.length(),
                            c.endOffset());
                    replace.setTermBuffer(language + pre + token);
                    replace.setType(c.type());
                    replace.setPositionIncrement(c.getPositionIncrement());
                } else {
                    String termText = new String(c.termBuffer(), 0, c.termLength());
                    replace = new org.apache.lucene.analysis.Token(c.startOffset() - pre.length(),
                            c.endOffset());
                    replace.setTermBuffer(pre + termText);
                    replace.setType(c.type());
                    replace.setPositionIncrement(c.getPositionIncrement());
                }
            } else {
                StringBuilder prefix = new StringBuilder();
                StringBuilder postfix = new StringBuilder();
                StringBuilder builder = prefix;
                for (int i = c.startOffset() - 1; i >= replace.endOffset(); i--) {
                    char test = testText.charAt(i);
                    if (((test == '*') || (test == '?')) && wildcardPoistions.contains(i)) {
                        builder.insert(0, test);
                    } else {
                        builder = postfix;
                        postfix.setLength(0);
                    }
                }
                String pre = prefix.toString();
                String post = postfix.toString();

                // Does it bridge?
                if ((pre.length() > 0) && (replace.endOffset() + pre.length()) == c.startOffset()) {
                    String termText = new String(c.termBuffer(), 0, c.termLength());
                    if (requiresMLTokenDuplication) {
                        int position = termText.indexOf("}");
                        @SuppressWarnings("unused")
                        String language = termText.substring(0, position + 1);
                        String token = termText.substring(position + 1);
                        int oldPositionIncrement = replace.getPositionIncrement();
                        String replaceTermText = new String(replace.termBuffer(), 0, replace.termLength());
                        replace = new org.apache.lucene.analysis.Token(replace.startOffset(), c.endOffset());
                        replace.setTermBuffer(replaceTermText + pre + token);
                        replace.setType(replace.type());
                        replace.setPositionIncrement(oldPositionIncrement);
                    } else {
                        int oldPositionIncrement = replace.getPositionIncrement();
                        String replaceTermText = new String(replace.termBuffer(), 0, replace.termLength());
                        replace = new org.apache.lucene.analysis.Token(replace.startOffset(), c.endOffset());
                        replace.setTermBuffer(replaceTermText + pre + termText);
                        replace.setType(replace.type());
                        replace.setPositionIncrement(oldPositionIncrement);
                    }
                } else {
                    String termText = new String(c.termBuffer(), 0, c.termLength());
                    if (requiresMLTokenDuplication) {
                        int position = termText.indexOf("}");
                        String language = termText.substring(0, position + 1);
                        String token = termText.substring(position + 1);
                        String replaceTermText = new String(replace.termBuffer(), 0, replace.termLength());
                        org.apache.lucene.analysis.Token last = new org.apache.lucene.analysis.Token(
                                replace.startOffset(), replace.endOffset() + post.length());
                        last.setTermBuffer(replaceTermText + post);
                        last.setType(replace.type());
                        last.setPositionIncrement(replace.getPositionIncrement());
                        fixedTokenSequence.add(last);
                        replace = new org.apache.lucene.analysis.Token(c.startOffset() - pre.length(),
                                c.endOffset());
                        replace.setTermBuffer(language + pre + token);
                        replace.setType(c.type());
                        replace.setPositionIncrement(c.getPositionIncrement());
                    } else {
                        String replaceTermText = new String(replace.termBuffer(), 0, replace.termLength());
                        org.apache.lucene.analysis.Token last = new org.apache.lucene.analysis.Token(
                                replace.startOffset(), replace.endOffset() + post.length());
                        last.setTermBuffer(replaceTermText + post);
                        last.setType(replace.type());
                        last.setPositionIncrement(replace.getPositionIncrement());
                        fixedTokenSequence.add(last);
                        replace = new org.apache.lucene.analysis.Token(c.startOffset() - pre.length(),
                                c.endOffset());
                        replace.setTermBuffer(pre + termText);
                        replace.setType(c.type());
                        replace.setPositionIncrement(c.getPositionIncrement());
                    }
                }
            }
        }
        // finish last
        if (replace != null) {
            StringBuilder postfix = new StringBuilder();
            if ((replace.endOffset() >= 0) && (replace.endOffset() < testText.length())) {
                for (int i = replace.endOffset(); i < testText.length(); i++) {
                    char test = testText.charAt(i);
                    if (((test == '*') || (test == '?')) && wildcardPoistions.contains(i)) {
                        postfix.append(test);
                    } else {
                        break;
                    }
                }
            }
            String post = postfix.toString();
            int oldPositionIncrement = replace.getPositionIncrement();
            String replaceTermText = new String(replace.termBuffer(), 0, replace.termLength());
            replace = new org.apache.lucene.analysis.Token(replace.startOffset(),
                    replace.endOffset() + post.length());
            replace.setTermBuffer(replaceTermText + post);
            replace.setType(replace.type());
            replace.setPositionIncrement(oldPositionIncrement);
            fixedTokenSequence.add(replace);
        }
    }

    // rebuild fixed list

    ArrayList<org.apache.lucene.analysis.Token> fixed = new ArrayList<org.apache.lucene.analysis.Token>();
    for (LinkedList<org.apache.lucene.analysis.Token> tokenSequence : fixedTokenSequences) {
        for (org.apache.lucene.analysis.Token token : tokenSequence) {
            fixed.add(token);
        }
    }

    // reorder by start position and increment

    Collections.sort(fixed, new Comparator<org.apache.lucene.analysis.Token>() {

        public int compare(Token o1, Token o2) {
            int dif = o1.startOffset() - o2.startOffset();
            if (dif != 0) {
                return dif;
            } else {
                return o1.getPositionIncrement() - o2.getPositionIncrement();
            }
        }
    });

    // make sure we remove any tokens we have duplicated

    @SuppressWarnings("rawtypes")
    OrderedHashSet unique = new OrderedHashSet();
    unique.addAll(fixed);
    fixed = new ArrayList<org.apache.lucene.analysis.Token>(unique);

    list = fixed;

    // add any missing locales back to the tokens

    if (localePrefix.length() > 0) {
        for (int j = 0; j < list.size(); j++) {
            org.apache.lucene.analysis.Token currentToken = list.get(j);
            String termText = new String(currentToken.termBuffer(), 0, currentToken.termLength());
            currentToken.setTermBuffer(localePrefix + termText);
        }
    }

    if (list.size() == 0)
        return null;
    else if (list.size() == 1) {
        nextToken = list.get(0);
        String termText = new String(nextToken.termBuffer(), 0, nextToken.termLength());
        if (termText.contains("*") || termText.contains("?")) {
            return newWildcardQuery(
                    new Term(field, getLowercaseExpandedTerms() ? termText.toLowerCase() : termText));
        } else {
            return newTermQuery(new Term(field, termText));
        }
    } else {
        if (severalTokensAtSamePosition) {
            if (positionCount == 1) {
                // no phrase query:
                BooleanQuery q = newBooleanQuery(true);
                for (int i = 0; i < list.size(); i++) {
                    Query currentQuery;
                    nextToken = list.get(i);
                    String termText = new String(nextToken.termBuffer(), 0, nextToken.termLength());
                    if (termText.contains("*") || termText.contains("?")) {
                        currentQuery = newWildcardQuery(new Term(field,
                                getLowercaseExpandedTerms() ? termText.toLowerCase() : termText));
                    } else {
                        currentQuery = newTermQuery(new Term(field, termText));
                    }
                    q.add(currentQuery, BooleanClause.Occur.SHOULD);
                }
                return q;
            }
            // Consider if we can use a multi-phrase query (e.g for synonym use rather then WordDelimiterFilterFactory)
            else if (canUseMultiPhraseQuery(fixedTokenSequences)) {
                // phrase query:
                MultiPhraseQuery mpq = newMultiPhraseQuery();
                mpq.setSlop(internalSlop);
                ArrayList<Term> multiTerms = new ArrayList<Term>();
                int position = 0;
                for (int i = 0; i < list.size(); i++) {
                    nextToken = list.get(i);
                    String termText = new String(nextToken.termBuffer(), 0, nextToken.termLength());

                    Term term = new Term(field, termText);
                    if ((termText != null) && (termText.contains("*") || termText.contains("?"))) {
                        addWildcardTerms(multiTerms, term);
                    } else {
                        multiTerms.add(term);
                    }

                    if (nextToken.getPositionIncrement() > 0 && multiTerms.size() > 0) {
                        if (getEnablePositionIncrements()) {
                            mpq.add(multiTerms.toArray(new Term[0]), position);
                        } else {
                            mpq.add(multiTerms.toArray(new Term[0]));
                        }
                        checkTermCount(field, queryText, mpq);
                        multiTerms.clear();
                    }
                    position += nextToken.getPositionIncrement();

                }
                if (getEnablePositionIncrements()) {
                    if (multiTerms.size() > 0) {
                        mpq.add(multiTerms.toArray(new Term[0]), position);
                    }
                    //                        else
                    //                        {
                    //                            mpq.add(new Term[] { new Term(field, "\u0000") }, position);
                    //                        }
                } else {
                    if (multiTerms.size() > 0) {
                        mpq.add(multiTerms.toArray(new Term[0]));
                    }
                    //                        else
                    //                        {
                    //                            mpq.add(new Term[] { new Term(field, "\u0000") });
                    //                        }
                }
                checkTermCount(field, queryText, mpq);
                return mpq;

            }
            // Word delimiter factory and other odd things generate complex token patterns
            // Smart skip token  sequences with small tokens that generate toomany wildcards
            // Fall back to the larger pattern
            // e.g Site1* will not do (S ite 1*) or (Site 1*)  if 1* matches too much (S ite1*)  and (Site1*) will still be OK 
            // If we skip all (for just 1* in the input) this is still an issue.
            else {
                boolean skippedTokens = false;
                BooleanQuery q = newBooleanQuery(true);
                TOKEN_SEQUENCE: for (LinkedList<org.apache.lucene.analysis.Token> tokenSequence : fixedTokenSequences) {
                    // phrase query:
                    MultiPhraseQuery mpq = newMultiPhraseQuery();
                    mpq.setSlop(internalSlop);
                    int position = 0;
                    for (int i = 0; i < tokenSequence.size(); i++) {
                        nextToken = (org.apache.lucene.analysis.Token) tokenSequence.get(i);
                        String termText = new String(nextToken.termBuffer(), 0, nextToken.termLength());

                        Term term = new Term(field, termText);

                        if (getEnablePositionIncrements()) {
                            if ((termText != null) && (termText.contains("*") || termText.contains("?"))) {
                                mpq.add(getMatchingTerms(field, term), position);
                            } else {
                                mpq.add(new Term[] { term }, position);
                            }
                            if (exceedsTermCount(mpq)) {
                                // We could duplicate the token sequence without the failing wildcard expansion and try again ??
                                skippedTokens = true;
                                continue TOKEN_SEQUENCE;
                            }
                            if (nextToken.getPositionIncrement() > 0) {
                                position += nextToken.getPositionIncrement();
                            } else {
                                position++;
                            }

                        } else {
                            if ((termText != null) && (termText.contains("*") || termText.contains("?"))) {
                                mpq.add(getMatchingTerms(field, term));
                            } else {
                                mpq.add(term);
                            }
                            if (exceedsTermCount(mpq)) {
                                skippedTokens = true;
                                continue TOKEN_SEQUENCE;
                            }
                        }
                    }
                    q.add(mpq, BooleanClause.Occur.SHOULD);
                }
                if (skippedTokens && (q.clauses().size() == 0)) {
                    throw new LuceneQueryParserException(
                            "Query skipped all token sequences as wildcards generated too many clauses: "
                                    + field + " " + queryText);
                }
                return q;
            }
        } else {
            MultiPhraseQuery q = new MultiPhraseQuery();
            q.setSlop(internalSlop);
            int position = 0;
            for (int i = 0; i < list.size(); i++) {
                nextToken = list.get(i);
                String termText = new String(nextToken.termBuffer(), 0, nextToken.termLength());
                Term term = new Term(field, termText);
                if (getEnablePositionIncrements()) {
                    if ((termText != null) && (termText.contains("*") || termText.contains("?"))) {
                        q.add(getMatchingTerms(field, term), position);
                    } else {
                        q.add(new Term[] { term }, position);
                    }
                    checkTermCount(field, queryText, q);
                    if (nextToken.getPositionIncrement() > 0) {
                        position += nextToken.getPositionIncrement();
                    } else {
                        position++;
                    }
                } else {
                    if ((termText != null) && (termText.contains("*") || termText.contains("?"))) {
                        q.add(getMatchingTerms(field, term));
                    } else {
                        q.add(term);
                    }
                    checkTermCount(field, queryText, q);
                }
            }
            return q;
        }
    }
}

From source file:org.apache.blur.lucene.search.BlurQueryParser.java

License:Apache License

@Override
protected MultiPhraseQuery newMultiPhraseQuery() {
    return new MultiPhraseQuery() {
        @Override/*from   w ww.j av  a  2 s. c  o m*/
        public void add(Term[] terms, int position) {
            super.add(terms, position);
            for (Term term : terms) {
                String resolvedField = _fieldManager.resolveField(term.field());
                customQueryCheck(resolvedField);
                addField(this, resolvedField);
            }
        }
    };
}