// Java tutorial
/** * Copyright (c)2010-2011 Enterprise Website Content Management System(EWCMS), All rights reserved. * EWCMS PROPRIETARY/CONFIDENTIAL. Use is subject to license terms. * http://www.ewcms.com */ package com.ewcms.content.document.util.analyzer.lucene; import java.io.IOException; import java.io.StringReader; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Stack; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.BooleanClause.Occur; import com.ewcms.content.document.util.analyzer.IKSegmentation; import com.ewcms.content.document.util.analyzer.Lexeme; /** * <ul> * IK? * ???? * ?? * IK Analyzer V3 * </ul> * * @author ? */ public final class IKQueryParser { //??? private static ThreadLocal<Map<String, TokenBranch>> keywordCacheThreadLocal = new ThreadLocal<Map<String, TokenBranch>>(); //??? private static boolean isMaxWordLength = false; /** * ? * isMaxWordLength = true ?? * @param isMaxWordLength */ public static void setMaxWordLength(boolean isMaxWordLength) { IKQueryParser.isMaxWordLength = isMaxWordLength; } /** * query * ?Query? * @param queries * @return */ private static Query optimizeQueries(List<Query> queries) { //??branch query if (queries.size() == 0) { return null; } else if (queries.size() == 1) { return queries.get(0); } else { BooleanQuery mustQueries = new BooleanQuery(); for (Query q : queries) { mustQueries.add(q, Occur.MUST); } return mustQueries; } } /** * ?? * @return */ private static Map<String, TokenBranch> getTheadLocalCache() { Map<String, TokenBranch> keywordCache = keywordCacheThreadLocal.get(); if (keywordCache == null) { keywordCache = new HashMap<String, TokenBranch>(4); keywordCacheThreadLocal.set(keywordCache); } return keywordCache; } /** * ?? 
* @param query * @return */ private static TokenBranch getCachedTokenBranch(String query) { Map<String, TokenBranch> keywordCache = getTheadLocalCache(); return keywordCache.get(query); } /** * ?? * @param query * @return */ private static void cachedTokenBranch(String query, TokenBranch tb) { Map<String, TokenBranch> keywordCache = getTheadLocalCache(); keywordCache.put(query, tb); } /** * ???Field? * @param field * @param query * @return * @throws IOException */ private static Query _parse(String field, String query) throws IOException { if (field == null) { throw new IllegalArgumentException("parameter \"field\" is null"); } if (query == null || "".equals(query.trim())) { return new TermQuery(new Term(field)); } //???queryTokenBranch TokenBranch root = getCachedTokenBranch(query); if (root != null) { return optimizeQueries(root.toQueries(field)); } else { root = new TokenBranch(null); //?q? StringReader input = new StringReader(query.trim()); IKSegmentation ikSeg = new IKSegmentation(input, isMaxWordLength); for (Lexeme lexeme = ikSeg.next(); lexeme != null; lexeme = ikSeg.next()) { //?? root.accept(lexeme); } //?? cachedTokenBranch(query, root); return optimizeQueries(root.toQueries(field)); } } /** * ?IK? * @param ikQueryExp * @return Query */ public static Query parse(String ikQueryExp) { ExpressionParser ikExpParser = new ExpressionParser(); return ikExpParser.parserExp(ikQueryExp); } /** /** * ??,?Field? * @param field -- Document field name * @param query -- keyword * @return Query * @throws IOException */ public static Query parse(String field, String query) throws IOException { if (field == null) { throw new IllegalArgumentException("parameter \"field\" is null"); } String[] qParts = query.split("\\s"); if (qParts.length > 1) { BooleanQuery resultQuery = new BooleanQuery(); for (String q : qParts) { //? 
if ("".equals(q)) { continue; } Query partQuery = _parse(field, q); if (partQuery != null && (!(partQuery instanceof BooleanQuery) || ((BooleanQuery) partQuery).getClauses().length > 0)) { resultQuery.add(partQuery, Occur.SHOULD); } } return resultQuery; } else { return _parse(field, query); } } /** * Field,??? * @param fields -- Document fields name * @param query -- keyword * @return Query * @throws IOException */ public static Query parseMultiField(String[] fields, String query) throws IOException { if (fields == null) { throw new IllegalArgumentException("parameter \"fields\" is null"); } BooleanQuery resultQuery = new BooleanQuery(); for (String field : fields) { if (field != null) { Query partQuery = parse(field, query); if (partQuery != null && (!(partQuery instanceof BooleanQuery) || ((BooleanQuery) partQuery).getClauses().length > 0)) { resultQuery.add(partQuery, Occur.SHOULD); } } } return resultQuery; } /** * Field,??,Occur? * @param fields -- Document fields name * @param query -- keyword * @param flags -- BooleanClause * @return Query * @throws IOException */ public static Query parseMultiField(String[] fields, String query, BooleanClause.Occur[] flags) throws IOException { if (fields == null) { throw new IllegalArgumentException("parameter \"fields\" is null"); } if (flags == null) { throw new IllegalArgumentException("parameter \"flags\" is null"); } if (flags.length != fields.length) { throw new IllegalArgumentException("flags.length != fields.length"); } BooleanQuery resultQuery = new BooleanQuery(); for (int i = 0; i < fields.length; i++) { if (fields[i] != null) { Query partQuery = parse(fields[i], query); if (partQuery != null && (!(partQuery instanceof BooleanQuery) || ((BooleanQuery) partQuery).getClauses().length > 0)) { resultQuery.add(partQuery, flags[i]); } } } return resultQuery; } /** * Field?? 
* @param fields * @param queries * @return Query * @throws IOException */ public static Query parseMultiField(String[] fields, String[] queries) throws IOException { if (fields == null) { throw new IllegalArgumentException("parameter \"fields\" is null"); } if (queries == null) { throw new IllegalArgumentException("parameter \"queries\" is null"); } if (queries.length != fields.length) { throw new IllegalArgumentException("queries.length != fields.length"); } BooleanQuery resultQuery = new BooleanQuery(); for (int i = 0; i < fields.length; i++) { if (fields[i] != null) { Query partQuery = parse(fields[i], queries[i]); if (partQuery != null && (!(partQuery instanceof BooleanQuery) || ((BooleanQuery) partQuery).getClauses().length > 0)) { resultQuery.add(partQuery, Occur.SHOULD); } } } return resultQuery; } /** * Field,?,Occur? * @param fields * @param queries * @param flags * @return Query * @throws IOException */ public static Query parseMultiField(String[] fields, String[] queries, BooleanClause.Occur[] flags) throws IOException { if (fields == null) { throw new IllegalArgumentException("parameter \"fields\" is null"); } if (queries == null) { throw new IllegalArgumentException("parameter \"queries\" is null"); } if (flags == null) { throw new IllegalArgumentException("parameter \"flags\" is null"); } if (!(queries.length == fields.length && queries.length == flags.length)) { throw new IllegalArgumentException("queries, fields, and flags array have have different length"); } BooleanQuery resultQuery = new BooleanQuery(); for (int i = 0; i < fields.length; i++) { if (fields[i] != null) { Query partQuery = parse(fields[i], queries[i]); if (partQuery != null && (!(partQuery instanceof BooleanQuery) || ((BooleanQuery) partQuery).getClauses().length > 0)) { resultQuery.add(partQuery, flags[i]); } } } return resultQuery; } /** * ? * ????? 
* @author * */ private static class TokenBranch { private static final int REFUSED = -1; private static final int ACCEPTED = 0; private static final int TONEXT = 1; //? private int leftBorder; //?? private int rightBorder; //?? private Lexeme lexeme; //??? private List<TokenBranch> acceptedBranchs; //?? private TokenBranch nextBranch; TokenBranch(Lexeme lexeme) { if (lexeme != null) { this.lexeme = lexeme; //?branch? this.leftBorder = lexeme.getBeginPosition(); this.rightBorder = lexeme.getEndPosition(); } } @SuppressWarnings("unused") public int getLeftBorder() { return leftBorder; } @SuppressWarnings("unused") public int getRightBorder() { return rightBorder; } public Lexeme getLexeme() { return lexeme; } @SuppressWarnings("unused") public List<TokenBranch> getAcceptedBranchs() { return acceptedBranchs; } @SuppressWarnings("unused") public TokenBranch getNextBranch() { return nextBranch; } public int hashCode() { if (this.lexeme == null) { return 0; } else { return this.lexeme.hashCode() * 37; } } public boolean equals(Object o) { if (o == null) { return false; } if (this == o) { return true; } if (o instanceof TokenBranch) { TokenBranch other = (TokenBranch) o; if (this.lexeme == null || other.getLexeme() == null) { return false; } else { return this.lexeme.equals(other.getLexeme()); } } else { return false; } } /** * ?? * @param _lexeme * @return ?branch?? */ boolean accept(Lexeme _lexeme) { /* * lexeme ?branch ?? * acceptType : REFUSED ?? * acceptType : ACCEPTED ? * acceptType : TONEXT ? 
*/ int acceptType = checkAccept(_lexeme); switch (acceptType) { case REFUSED: // REFUSE return false; case ACCEPTED: if (acceptedBranchs == null) { //?branch?branch?branch acceptedBranchs = new ArrayList<TokenBranch>(2); acceptedBranchs.add(new TokenBranch(_lexeme)); } else { boolean acceptedByChild = false; //?branch?branch?branch for (TokenBranch childBranch : acceptedBranchs) { acceptedByChild = childBranch.accept(_lexeme) || acceptedByChild; } //?branch??branch if (!acceptedByChild) { acceptedBranchs.add(new TokenBranch(_lexeme)); } } //branch? if (_lexeme.getEndPosition() > this.rightBorder) { this.rightBorder = _lexeme.getEndPosition(); } break; case TONEXT: //lexeme?branch if (this.nextBranch == null) { //?? this.nextBranch = new TokenBranch(null); } this.nextBranch.accept(_lexeme); break; } return true; } /** * ??Query * @return */ List<Query> toQueries(String fieldName) { List<Query> queries = new ArrayList<Query>(1); //??branch query if (lexeme != null) { queries.add(new TermQuery(new Term(fieldName, lexeme.getLexemeText()))); } //?child branch query if (acceptedBranchs != null && acceptedBranchs.size() > 0) { if (acceptedBranchs.size() == 1) { Query onlyOneQuery = optimizeQueries(acceptedBranchs.get(0).toQueries(fieldName)); if (onlyOneQuery != null) { queries.add(onlyOneQuery); } } else { BooleanQuery orQuery = new BooleanQuery(); for (TokenBranch childBranch : acceptedBranchs) { Query childQuery = optimizeQueries(childBranch.toQueries(fieldName)); if (childQuery != null) { orQuery.add(childQuery, Occur.SHOULD); } } if (orQuery.getClauses().length > 0) { queries.add(orQuery); } } } //?nextBranchquery if (nextBranch != null) { queries.addAll(nextBranch.toQueries(fieldName)); } return queries; } /** * lexeme??branch? * @param lexeme * @return ?? 
*/ private int checkAccept(Lexeme _lexeme) { int acceptType = 0; if (_lexeme == null) { throw new IllegalArgumentException("parameter:lexeme is null"); } if (null == this.lexeme) {//?branch??ROOT if (this.rightBorder > 0 //?branchlexeme && _lexeme.getBeginPosition() >= this.rightBorder) { //_lexeme ?branch? acceptType = TONEXT; } else { acceptType = ACCEPTED; } } else {//?branch? if (_lexeme.getBeginPosition() < this.lexeme.getBeginPosition()) { //_lexeme ? this.lexeme????? acceptType = REFUSED; } else if (_lexeme.getBeginPosition() >= this.lexeme.getBeginPosition() && _lexeme.getBeginPosition() < this.lexeme.getEndPosition()) { // _lexeme this.lexeme acceptType = REFUSED; } else if (_lexeme.getBeginPosition() >= this.lexeme.getEndPosition() && _lexeme.getBeginPosition() < this.rightBorder) { //_lexeme this.lexeme ? _lexeme ?branch acceptType = ACCEPTED; } else {//_lexeme.getBeginPosition() >= this.rightBorder //_lexeme ?branch? acceptType = TONEXT; } } return acceptType; } } /** * ?? * alpha * lucene? * ?? * (id='1231231' && title:'monkey') || (content:'?' || ulr='www.ik.com') - name:'helloword' * * @author linliangyi * May 20, 2010 */ static class ExpressionParser { //public static final String LUCENE_SPECIAL_CHAR = "&&||-()':="; private List<Element> elements = new ArrayList<Element>(); private Stack<Query> querys = new Stack<Query>(); private Stack<Element> operates = new Stack<Element>(); public ExpressionParser() { } /** * ???Lucene Query * * @param expression * @return */ public Query parserExp(String expression) { Query lucenceQuery = null; try { //? this.splitElements(expression); //? this.parseSyntax(); if (this.querys.size() == 1) { lucenceQuery = this.querys.pop(); } else { throw new IllegalStateException("? ?"); } } finally { elements.clear(); querys.clear(); operates.clear(); } return lucenceQuery; } /** * ?? 
* @param expression */ private void splitElements(String expression) { if (expression == null) { return; } Element curretElement = null; char[] expChars = expression.toCharArray(); for (int i = 0; i < expChars.length; i++) { switch (expChars[i]) { case '&': if (curretElement == null) { curretElement = new Element(); curretElement.type = '&'; curretElement.append(expChars[i]); } else if (curretElement.type == '&') { curretElement.append(expChars[i]); this.elements.add(curretElement); curretElement = null; } else if (curretElement.type == '\'') { curretElement.append(expChars[i]); } else { this.elements.add(curretElement); curretElement = new Element(); curretElement.type = '&'; curretElement.append(expChars[i]); } break; case '|': if (curretElement == null) { curretElement = new Element(); curretElement.type = '|'; curretElement.append(expChars[i]); } else if (curretElement.type == '|') { curretElement.append(expChars[i]); this.elements.add(curretElement); curretElement = null; } else if (curretElement.type == '\'') { curretElement.append(expChars[i]); } else { this.elements.add(curretElement); curretElement = new Element(); curretElement.type = '|'; curretElement.append(expChars[i]); } break; case '-': if (curretElement != null) { if (curretElement.type == '\'') { curretElement.append(expChars[i]); continue; } else { this.elements.add(curretElement); } } curretElement = new Element(); curretElement.type = '-'; curretElement.append(expChars[i]); this.elements.add(curretElement); curretElement = null; break; case '(': if (curretElement != null) { if (curretElement.type == '\'') { curretElement.append(expChars[i]); continue; } else { this.elements.add(curretElement); } } curretElement = new Element(); curretElement.type = '('; curretElement.append(expChars[i]); this.elements.add(curretElement); curretElement = null; break; case ')': if (curretElement != null) { if (curretElement.type == '\'') { curretElement.append(expChars[i]); continue; } else { 
this.elements.add(curretElement); } } curretElement = new Element(); curretElement.type = ')'; curretElement.append(expChars[i]); this.elements.add(curretElement); curretElement = null; break; case ':': if (curretElement != null) { if (curretElement.type == '\'') { curretElement.append(expChars[i]); continue; } else { this.elements.add(curretElement); } } curretElement = new Element(); curretElement.type = ':'; curretElement.append(expChars[i]); this.elements.add(curretElement); curretElement = null; break; case '=': if (curretElement != null) { if (curretElement.type == '\'') { curretElement.append(expChars[i]); continue; } else { this.elements.add(curretElement); } } curretElement = new Element(); curretElement.type = '='; curretElement.append(expChars[i]); this.elements.add(curretElement); curretElement = null; break; case ' ': if (curretElement != null) { if (curretElement.type == '\'') { curretElement.append(expChars[i]); } else { this.elements.add(curretElement); curretElement = null; } } break; case '\'': if (curretElement == null) { curretElement = new Element(); curretElement.type = '\''; } else if (curretElement.type == '\'') { this.elements.add(curretElement); curretElement = null; } else { this.elements.add(curretElement); curretElement = new Element(); curretElement.type = '\''; } break; default: if (curretElement == null) { curretElement = new Element(); curretElement.type = 'F'; curretElement.append(expChars[i]); } else if (curretElement.type == 'F') { curretElement.append(expChars[i]); } else if (curretElement.type == '\'') { curretElement.append(expChars[i]); } else { this.elements.add(curretElement); curretElement = new Element(); curretElement.type = 'F'; curretElement.append(expChars[i]); } } } if (curretElement != null) { this.elements.add(curretElement); curretElement = null; } } /** * ? 
* */ private void parseSyntax() { for (int i = 0; i < this.elements.size(); i++) { Element e = this.elements.get(i); if ('F' == e.type) { Element e2 = this.elements.get(i + 1); if ('=' != e2.type && ':' != e2.type) { throw new IllegalStateException("? = ?"); } Element e3 = this.elements.get(i + 2); if ('\'' != e3.type) { throw new IllegalStateException("??"); } i += 2; if ('=' == e2.type) { TermQuery tQuery = new TermQuery(new Term(e.toString(), e3.toString())); this.querys.push(tQuery); } else if (':' == e2.type) { try { Query tQuery = IKQueryParser.parse(e.toString(), e3.toString()); this.querys.push(tQuery); } catch (IOException e1) { e1.printStackTrace(); } } } else if ('(' == e.type) { this.operates.push(e); } else if (')' == e.type) { boolean doPop = true; while (doPop && !this.operates.empty()) { Element op = this.operates.pop(); if ('(' == op.type) { doPop = false; } else { Query q = toQuery(op); this.querys.push(q); } } } else { if (this.operates.isEmpty()) { this.operates.push(e); } else { boolean doPeek = true; while (doPeek && !this.operates.isEmpty()) { Element eleOnTop = this.operates.peek(); if ('(' == eleOnTop.type) { doPeek = false; this.operates.push(e); } else if (compare(e, eleOnTop) == 1) { this.operates.push(e); doPeek = false; } else if (compare(e, eleOnTop) == 0) { Query q = toQuery(eleOnTop); this.operates.pop(); this.querys.push(q); } else { Query q = toQuery(eleOnTop); this.operates.pop(); this.querys.push(q); } } if (doPeek && this.operates.empty()) { this.operates.push(e); } } } } while (!this.operates.isEmpty()) { Element eleOnTop = this.operates.pop(); Query q = toQuery(eleOnTop); this.querys.push(q); } } /** * ???BooleanQuery * @param op * @return */ private Query toQuery(Element op) { BooleanQuery resultQuery = new BooleanQuery(); if (this.querys.size() < 2) { throw new IllegalStateException("?SubQuery ??"); } Query q2 = this.querys.pop(); Query q1 = this.querys.pop(); if ('&' == op.type) { if (q1 instanceof TermQuery) { 
resultQuery.add(q1, Occur.MUST); } else { BooleanClause[] clauses = ((BooleanQuery) q1).getClauses(); if (clauses[0].getOccur() == Occur.MUST) { for (BooleanClause c : clauses) { resultQuery.add(c); } } else { resultQuery.add(q1, Occur.MUST); } } if (q2 instanceof TermQuery) { resultQuery.add(q2, Occur.MUST); } else { BooleanClause[] clauses = ((BooleanQuery) q2).getClauses(); if (clauses[0].getOccur() == Occur.MUST) { for (BooleanClause c : clauses) { resultQuery.add(c); } } else { resultQuery.add(q2, Occur.MUST); } } } else if ('|' == op.type) { if (q1 instanceof TermQuery) { resultQuery.add(q1, Occur.SHOULD); } else { BooleanClause[] clauses = ((BooleanQuery) q1).getClauses(); if (clauses[0].getOccur() == Occur.SHOULD) { for (BooleanClause c : clauses) { resultQuery.add(c); } } else { resultQuery.add(q1, Occur.SHOULD); } } if (q2 instanceof TermQuery) { resultQuery.add(q2, Occur.SHOULD); } else { BooleanClause[] clauses = ((BooleanQuery) q2).getClauses(); if (clauses[0].getOccur() == Occur.SHOULD) { for (BooleanClause c : clauses) { resultQuery.add(c); } } else { resultQuery.add(q2, Occur.SHOULD); } } } else if ('-' == op.type) { if (q1 instanceof TermQuery) { resultQuery.add(q1, Occur.MUST); } else { BooleanClause[] clauses = ((BooleanQuery) q1).getClauses(); for (BooleanClause c : clauses) { resultQuery.add(c); } } resultQuery.add(q2, Occur.MUST_NOT); } return resultQuery; } /** * ? * @param e1 * @param e2 * @return */ private int compare(Element e1, Element e2) { if ('&' == e1.type) { if ('&' == e2.type) { return 0; } else { return 1; } } else if ('|' == e1.type) { if ('&' == e2.type) { return -1; } else if ('|' == e2.type) { return 0; } else { return 1; } } else { if ('-' == e2.type) { return 0; } else { return -1; } } } /** * ? 
* * @author linliangyi * May 20, 2010 */ private class Element { char type = 0; StringBuffer eleTextBuff; public Element() { eleTextBuff = new StringBuffer(); } public void append(char c) { this.eleTextBuff.append(c); } public String toString() { return this.eleTextBuff.toString(); } } } }