// Java tutorial
/** * IK ? 5.0 * IK Analyzer release 5.0 * * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * * ??(linliangyi2005@gmail.com)?? * ? 2012 * provided by Linliangyi and copyright 2012 by Oolong studio * */ package cc.pp.analyzer.ik.query; import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.LinkedList; import java.util.List; import java.util.Stack; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TermRangeQuery; import org.apache.lucene.util.BytesRef; /** * IK?? * ?SWMCQuery * * ?? * (id='1231231' && title:'monkey') || (content:'?' 
|| ulr='www.ik.com') - name:'helloword' * @author linliangyi * */ public class IKQueryExpressionParser { //public static final String LUCENE_SPECIAL_CHAR = "&&||-()':={}[],"; private final List<Element> elements = new ArrayList<Element>(); private final Stack<Query> querys = new Stack<Query>(); private final Stack<Element> operates = new Stack<Element>(); /** * ???Lucene Query * * @param expression * @param quickMode * @return Lucene query */ public Query parseExp(String expression, boolean quickMode) { Query lucenceQuery = null; if (expression != null && !"".equals(expression.trim())) { try { //? this.splitElements(expression); //? this.parseSyntax(quickMode); if (this.querys.size() == 1) { lucenceQuery = this.querys.pop(); } else { throw new IllegalStateException("? ? ?"); } } finally { elements.clear(); querys.clear(); operates.clear(); } } return lucenceQuery; } /** * ?? * @param expression */ private void splitElements(String expression) { if (expression == null) { return; } Element curretElement = null; char[] expChars = expression.toCharArray(); for (int i = 0; i < expChars.length; i++) { switch (expChars[i]) { case '&': if (curretElement == null) { curretElement = new Element(); curretElement.type = '&'; curretElement.append(expChars[i]); } else if (curretElement.type == '&') { curretElement.append(expChars[i]); this.elements.add(curretElement); curretElement = null; } else if (curretElement.type == '\'') { curretElement.append(expChars[i]); } else { this.elements.add(curretElement); curretElement = new Element(); curretElement.type = '&'; curretElement.append(expChars[i]); } break; case '|': if (curretElement == null) { curretElement = new Element(); curretElement.type = '|'; curretElement.append(expChars[i]); } else if (curretElement.type == '|') { curretElement.append(expChars[i]); this.elements.add(curretElement); curretElement = null; } else if (curretElement.type == '\'') { curretElement.append(expChars[i]); } else { this.elements.add(curretElement); 
curretElement = new Element(); curretElement.type = '|'; curretElement.append(expChars[i]); } break; case '-': if (curretElement != null) { if (curretElement.type == '\'') { curretElement.append(expChars[i]); continue; } else { this.elements.add(curretElement); } } curretElement = new Element(); curretElement.type = '-'; curretElement.append(expChars[i]); this.elements.add(curretElement); curretElement = null; break; case '(': if (curretElement != null) { if (curretElement.type == '\'') { curretElement.append(expChars[i]); continue; } else { this.elements.add(curretElement); } } curretElement = new Element(); curretElement.type = '('; curretElement.append(expChars[i]); this.elements.add(curretElement); curretElement = null; break; case ')': if (curretElement != null) { if (curretElement.type == '\'') { curretElement.append(expChars[i]); continue; } else { this.elements.add(curretElement); } } curretElement = new Element(); curretElement.type = ')'; curretElement.append(expChars[i]); this.elements.add(curretElement); curretElement = null; break; case ':': if (curretElement != null) { if (curretElement.type == '\'') { curretElement.append(expChars[i]); continue; } else { this.elements.add(curretElement); } } curretElement = new Element(); curretElement.type = ':'; curretElement.append(expChars[i]); this.elements.add(curretElement); curretElement = null; break; case '=': if (curretElement != null) { if (curretElement.type == '\'') { curretElement.append(expChars[i]); continue; } else { this.elements.add(curretElement); } } curretElement = new Element(); curretElement.type = '='; curretElement.append(expChars[i]); this.elements.add(curretElement); curretElement = null; break; case ' ': if (curretElement != null) { if (curretElement.type == '\'') { curretElement.append(expChars[i]); } else { this.elements.add(curretElement); curretElement = null; } } break; case '\'': if (curretElement == null) { curretElement = new Element(); curretElement.type = '\''; } else if 
(curretElement.type == '\'') { this.elements.add(curretElement); curretElement = null; } else { this.elements.add(curretElement); curretElement = new Element(); curretElement.type = '\''; } break; case '[': if (curretElement != null) { if (curretElement.type == '\'') { curretElement.append(expChars[i]); continue; } else { this.elements.add(curretElement); } } curretElement = new Element(); curretElement.type = '['; curretElement.append(expChars[i]); this.elements.add(curretElement); curretElement = null; break; case ']': if (curretElement != null) { if (curretElement.type == '\'') { curretElement.append(expChars[i]); continue; } else { this.elements.add(curretElement); } } curretElement = new Element(); curretElement.type = ']'; curretElement.append(expChars[i]); this.elements.add(curretElement); curretElement = null; break; case '{': if (curretElement != null) { if (curretElement.type == '\'') { curretElement.append(expChars[i]); continue; } else { this.elements.add(curretElement); } } curretElement = new Element(); curretElement.type = '{'; curretElement.append(expChars[i]); this.elements.add(curretElement); curretElement = null; break; case '}': if (curretElement != null) { if (curretElement.type == '\'') { curretElement.append(expChars[i]); continue; } else { this.elements.add(curretElement); } } curretElement = new Element(); curretElement.type = '}'; curretElement.append(expChars[i]); this.elements.add(curretElement); curretElement = null; break; case ',': if (curretElement != null) { if (curretElement.type == '\'') { curretElement.append(expChars[i]); continue; } else { this.elements.add(curretElement); } } curretElement = new Element(); curretElement.type = ','; curretElement.append(expChars[i]); this.elements.add(curretElement); curretElement = null; break; default: if (curretElement == null) { curretElement = new Element(); curretElement.type = 'F'; curretElement.append(expChars[i]); } else if (curretElement.type == 'F') { 
curretElement.append(expChars[i]); } else if (curretElement.type == '\'') { curretElement.append(expChars[i]); } else { this.elements.add(curretElement); curretElement = new Element(); curretElement.type = 'F'; curretElement.append(expChars[i]); } } } if (curretElement != null) { this.elements.add(curretElement); curretElement = null; } } /** * ? * */ private void parseSyntax(boolean quickMode) { for (int i = 0; i < this.elements.size(); i++) { Element e = this.elements.get(i); if ('F' == e.type) { Element e2 = this.elements.get(i + 1); if ('=' != e2.type && ':' != e2.type) { throw new IllegalStateException("? = ?"); } Element e3 = this.elements.get(i + 2); //? = ? if ('\'' == e3.type) { i += 2; if ('=' == e2.type) { TermQuery tQuery = new TermQuery(new Term(e.toString(), e3.toString())); this.querys.push(tQuery); } else if (':' == e2.type) { String keyword = e3.toString(); //SWMCQuery Here Query _SWMCQuery = SWMCQueryBuilder.create(e.toString(), keyword, quickMode); this.querys.push(_SWMCQuery); } } else if ('[' == e3.type || '{' == e3.type) { i += 2; //? 
[] {} LinkedList<Element> eQueue = new LinkedList<Element>(); eQueue.add(e3); for (i++; i < this.elements.size(); i++) { Element eN = this.elements.get(i); eQueue.add(eN); if (']' == eN.type || '}' == eN.type) { break; } } //RangeQuery Query rangeQuery = this.toTermRangeQuery(e, eQueue); this.querys.push(rangeQuery); } else { throw new IllegalStateException("??"); } } else if ('(' == e.type) { this.operates.push(e); } else if (')' == e.type) { boolean doPop = true; while (doPop && !this.operates.empty()) { Element op = this.operates.pop(); if ('(' == op.type) { doPop = false; } else { Query q = toBooleanQuery(op); this.querys.push(q); } } } else { if (this.operates.isEmpty()) { this.operates.push(e); } else { boolean doPeek = true; while (doPeek && !this.operates.isEmpty()) { Element eleOnTop = this.operates.peek(); if ('(' == eleOnTop.type) { doPeek = false; this.operates.push(e); } else if (compare(e, eleOnTop) == 1) { this.operates.push(e); doPeek = false; } else if (compare(e, eleOnTop) == 0) { Query q = toBooleanQuery(eleOnTop); this.operates.pop(); this.querys.push(q); } else { Query q = toBooleanQuery(eleOnTop); this.operates.pop(); this.querys.push(q); } } if (doPeek && this.operates.empty()) { this.operates.push(e); } } } } while (!this.operates.isEmpty()) { Element eleOnTop = this.operates.pop(); Query q = toBooleanQuery(eleOnTop); this.querys.push(q); } } /** * ???BooleanQuery * @param op * @return */ private Query toBooleanQuery(Element op) { if (this.querys.size() == 0) { return null; } BooleanQuery resultQuery = new BooleanQuery(); if (this.querys.size() == 1) { return this.querys.get(0); } Query q2 = this.querys.pop(); Query q1 = this.querys.pop(); if ('&' == op.type) { if (q1 != null) { if (q1 instanceof BooleanQuery) { BooleanClause[] clauses = ((BooleanQuery) q1).getClauses(); if (clauses.length > 0 && clauses[0].getOccur() == Occur.MUST) { for (BooleanClause c : clauses) { resultQuery.add(c); } } else { resultQuery.add(q1, Occur.MUST); } } else { 
//q1 instanceof TermQuery //q1 instanceof TermRangeQuery //q1 instanceof PhraseQuery //others resultQuery.add(q1, Occur.MUST); } } if (q2 != null) { if (q2 instanceof BooleanQuery) { BooleanClause[] clauses = ((BooleanQuery) q2).getClauses(); if (clauses.length > 0 && clauses[0].getOccur() == Occur.MUST) { for (BooleanClause c : clauses) { resultQuery.add(c); } } else { resultQuery.add(q2, Occur.MUST); } } else { //q1 instanceof TermQuery //q1 instanceof TermRangeQuery //q1 instanceof PhraseQuery //others resultQuery.add(q2, Occur.MUST); } } } else if ('|' == op.type) { if (q1 != null) { if (q1 instanceof BooleanQuery) { BooleanClause[] clauses = ((BooleanQuery) q1).getClauses(); if (clauses.length > 0 && clauses[0].getOccur() == Occur.SHOULD) { for (BooleanClause c : clauses) { resultQuery.add(c); } } else { resultQuery.add(q1, Occur.SHOULD); } } else { //q1 instanceof TermQuery //q1 instanceof TermRangeQuery //q1 instanceof PhraseQuery //others resultQuery.add(q1, Occur.SHOULD); } } if (q2 != null) { if (q2 instanceof BooleanQuery) { BooleanClause[] clauses = ((BooleanQuery) q2).getClauses(); if (clauses.length > 0 && clauses[0].getOccur() == Occur.SHOULD) { for (BooleanClause c : clauses) { resultQuery.add(c); } } else { resultQuery.add(q2, Occur.SHOULD); } } else { //q2 instanceof TermQuery //q2 instanceof TermRangeQuery //q2 instanceof PhraseQuery //others resultQuery.add(q2, Occur.SHOULD); } } } else if ('-' == op.type) { if (q1 == null || q2 == null) { throw new IllegalStateException("?SubQuery ??"); } if (q1 instanceof BooleanQuery) { BooleanClause[] clauses = ((BooleanQuery) q1).getClauses(); if (clauses.length > 0) { for (BooleanClause c : clauses) { resultQuery.add(c); } } else { resultQuery.add(q1, Occur.MUST); } } else { //q1 instanceof TermQuery //q1 instanceof TermRangeQuery //q1 instanceof PhraseQuery //others resultQuery.add(q1, Occur.MUST); } resultQuery.add(q2, Occur.MUST_NOT); } return resultQuery; } /** * TermRangeQuery * @param elements * 
@return */ private TermRangeQuery toTermRangeQuery(Element fieldNameEle, LinkedList<Element> elements) { boolean includeFirst = false; boolean includeLast = false; String firstValue = null; String lastValue = null; //?[{ Element first = elements.getFirst(); if ('[' == first.type) { includeFirst = true; } else if ('{' == first.type) { includeFirst = false; } else { throw new IllegalStateException("?"); } //??]} Element last = elements.getLast(); if (']' == last.type) { includeLast = true; } else if ('}' == last.type) { includeLast = false; } else { throw new IllegalStateException("?, RangeQuery??"); } if (elements.size() < 4 || elements.size() > 5) { throw new IllegalStateException("?, RangeQuery "); } // Element e2 = elements.get(1); if ('\'' == e2.type) { firstValue = e2.toString(); // Element e3 = elements.get(2); if (',' != e3.type) { throw new IllegalStateException("?, RangeQuery?"); } // Element e4 = elements.get(3); if ('\'' == e4.type) { lastValue = e4.toString(); } else if (e4 != last) { throw new IllegalStateException("?RangeQuery?"); } } else if (',' == e2.type) { firstValue = null; // Element e3 = elements.get(2); if ('\'' == e3.type) { lastValue = e3.toString(); } else { throw new IllegalStateException("?RangeQuery?"); } } else { throw new IllegalStateException("?, RangeQuery?"); } /** * lucene4.6? */ TermRangeQuery query = null; try { query = new TermRangeQuery(fieldNameEle.toString(), new BytesRef(firstValue.getBytes("UTF8")), new BytesRef(lastValue.getBytes("UTF8")), includeFirst, includeLast); } catch (UnsupportedEncodingException e) { throw new IllegalStateException("?, ??"); } return query; } /** * ? 
* @param e1 * @param e2 * @return */ private int compare(Element e1, Element e2) { if ('&' == e1.type) { if ('&' == e2.type) { return 0; } else { return 1; } } else if ('|' == e1.type) { if ('&' == e2.type) { return -1; } else if ('|' == e2.type) { return 0; } else { return 1; } } else { if ('-' == e2.type) { return 0; } else { return -1; } } } /** * ???FieldName?FieldValue * @author linliangyi * May 20, 2010 */ private class Element { char type = 0; StringBuffer eleTextBuff; public Element() { eleTextBuff = new StringBuffer(); } public void append(char c) { this.eleTextBuff.append(c); } @Override public String toString() { return this.eleTextBuff.toString(); } } public static void main(String[] args) { IKQueryExpressionParser parser = new IKQueryExpressionParser(); //String ikQueryExp = "newsTitle:'?Bigfoot?'"; String ikQueryExp = "(id='ABcdRf' && date:{'20010101','20110101'} && keyword:'') || (content:'KSHT-KSH-A001-18' || ulr='www.ik.com') - name:''"; Query result = parser.parseExp(ikQueryExp, true); System.out.println(result); } }