com.bizosys.hsearch.outpipe.HQueryParser.java Source code

Introduction

Here is the source code for com.bizosys.hsearch.outpipe.HQueryParser.java
Source

/*
* Copyright 2010 Bizosys Technologies Limited
*
* Licensed to the Bizosys Technologies Limited (Bizosys) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The Bizosys licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.bizosys.hsearch.outpipe;

import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import com.bizosys.hsearch.inpipe.util.StopwordManager;
import com.bizosys.hsearch.query.HQuery;
import com.bizosys.hsearch.query.QueryContext;
import com.bizosys.hsearch.query.QueryPlanner;
import com.bizosys.hsearch.query.QueryTerm;
import com.bizosys.hsearch.query.ReserveQueryWord;
import com.bizosys.hsearch.util.LuceneConstants;
import com.bizosys.oneline.ApplicationFault;
import com.bizosys.oneline.SystemFault;
import com.bizosys.oneline.conf.Configuration;
import com.bizosys.oneline.pipes.PipeOut;
import com.bizosys.oneline.util.StringUtils;

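/**
 * Parses the raw query string carried by an {@link HQuery}.
 * Quoted phrases and the and/or/not operators are handled by this class,
 * and individual words are split with Lucene's StandardTokenizer.
 * Once parsed, based on the reserve words, it builds the
 * query execution plan.
 * @author karan
 *
 */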
public class HQueryParser implements PipeOut {

    /**
     * Creates a parser pipe. The constructor performs no work.
     */
    public HQueryParser() {
        super();
    }

    /**
     * Parses the query string of the supplied query object into its planner.
     *
     * @param objQuery the query to process; it is cast to {@link HQuery}
     * @param multiWriter not used by this pipe
     * @throws ApplicationFault if the query has no context or no query string,
     *         or if parsing fails
     * @throws SystemFault propagated from parsing
     */
    public void visit(Object objQuery, boolean multiWriter) throws ApplicationFault, SystemFault {

        final HQuery hQuery = (HQuery) objQuery;
        final QueryContext context = hQuery.ctx;

        // Nothing to parse without a context and a query string.
        final boolean blank = (context == null) || (context.queryString == null);
        if (blank) {
            throw new ApplicationFault("Blank Query ");
        }

        if (OutpipeLog.l.isDebugEnabled()) {
            OutpipeLog.l.debug("Query String = " + context.queryString);
        }

        parse(context.queryString, hQuery.planner, context);
    }

    /**
     * No-op: this pipe has nothing to commit.
     *
     * @param multiWriter not used
     */
    public void commit(boolean multiWriter) throws ApplicationFault, SystemFault {
        // Intentionally empty.
    }

    /**
     * Returns this same object rather than creating a new pipe.
     *
     * @return this parser
     */
    public PipeOut getInstance() {
        final PipeOut self = this;
        return self;
    }

    /**
     * No-op: this pipe reads nothing from the configuration.
     *
     * @param conf not used
     */
    public void init(Configuration conf) throws ApplicationFault, SystemFault {
        // Intentionally empty.
    }

    /**
     * Returns the name of this pipe.
     *
     * @return the literal {@code "HQueryParser"}
     */
    public String getName() {
        final String pipeName = "HQueryParser";
        return pipeName;
    }

    /**
     * Parses a query string and records the resulting terms in the planner
     * and the query context.
     * <p>
     * The text is lower-cased, quoted sections are located, and the text is
     * split into words. Each word is then handled as follows:
     * <ul>
     * <li>{@code and} moves the terms of the previous word from the optional
     *     list to the must list and makes the next word mandatory;</li>
     * <li>{@code or} moves the terms of the previous word from the must list
     *     to the optional list and makes the next word optional;</li>
     * <li>{@code not} negates the next word;</li>
     * <li>stopwords are skipped;</li>
     * <li>a leading {@code +} makes the word mandatory, a leading {@code -}
     *     makes it optional and a leading {@code !} negates it.</li>
     * </ul>
     * A word that maps to a reserve word is passed to
     * {@link QueryContext#populate}; any other word is added to the planner as
     * a phrase and its tokens are added as must or optional terms.
     * If the plan ends up with exactly one optional term and no must terms,
     * that term is promoted to a must term.
     *
     * @param text the raw query string, must not be null
     * @param planner receives the phrases and the must/optional terms
     * @param hq receives reserve-word values and the running term count
     * @throws ApplicationFault if tokenizing a word fails
     * @throws SystemFault declared for callers; not thrown directly here
     */
    private static void parse(String text, QueryPlanner planner, QueryContext hq)
            throws ApplicationFault, SystemFault {

        // Lower-case independently of the JVM default locale; with e.g. a
        // Turkish default locale "TITLE" would otherwise become "tıtle".
        text = text.toLowerCase(java.util.Locale.ROOT);
        List<Section> splits = quotedText(text, '"');
        List<String> words = tokenize(text, splits);
        if (null == words) {
            // tokenize() yields null for an empty query string; treat that
            // as "no words" instead of failing with a NullPointerException.
            words = new ArrayList<String>(0);
        }

        // Terms produced by the most recent word; "and"/"or" re-classify them.
        List<QueryTerm> lastTerms = new ArrayList<QueryTerm>(3);

        boolean optionalMode = true;
        boolean isNot = false;
        Set<String> stopwords = StopwordManager.getInstance().getStopwords();

        for (String rawWord : words) {
            String word = rawWord.trim();
            if (word.length() == 0)
                continue;

            if ("and".equals(word)) {
                if (0 != lastTerms.size() && null != planner.optionalTerms) {
                    for (QueryTerm lastTerm : lastTerms) {
                        if (planner.optionalTerms.contains(lastTerm)) {
                            planner.optionalTerms.remove(lastTerm);
                            planner.addMustTerm(lastTerm);
                            OutpipeLog.l.trace(lastTerm);
                        }
                    }
                }
                optionalMode = false;

            } else if ("or".equals(word)) {
                if (0 != lastTerms.size() && null != planner.mustTerms) {
                    for (QueryTerm lastTerm : lastTerms) {
                        if (planner.mustTerms.contains(lastTerm)) {
                            planner.mustTerms.remove(lastTerm);
                            planner.addOptionalTerm(lastTerm);
                        }
                    }
                }
                optionalMode = true;

            } else if ("not".equals(word)) {
                isNot = true;

            } else {
                // Don't proceed if this is a stopword. The pending
                // optionalMode / isNot flags stay in force for the next word.
                if (stopwords.contains(word))
                    continue;

                // A one-character prefix overrides the current mode.
                char firstChar = word.charAt(0);
                switch (firstChar) {
                case '+':
                    optionalMode = false;
                    word = word.substring(1);
                    break;
                case '-':
                    optionalMode = true;
                    word = word.substring(1);
                    break;
                case '!':
                    isNot = true;
                    word = word.substring(1);
                    break;
                default:
                }

                lastTerms.clear();

                //Make the base term
                QueryTerm reserveTerm = new QueryTerm(word, isNot);
                int reserveWord = ReserveQueryWord.getInstance().mapReserveWord(reserveTerm.termType);
                if (ReserveQueryWord.NO_RESERVE_WORD != reserveWord) {
                    hq.populate(reserveWord, reserveTerm.wordOrig);
                    lastTerms.add(reserveTerm);
                } else {
                    planner.addPhrase(reserveTerm);
                    List<String> lstWord = standardTokenizer(reserveTerm.wordOrig);
                    for (String aWord : lstWord) {
                        QueryTerm term = new QueryTerm(aWord, isNot);
                        term.setTermType(reserveTerm.termType);
                        if (optionalMode)
                            planner.addOptionalTerm(term);
                        else
                            planner.addMustTerm(term);
                        hq.totalTerms++;
                        lastTerms.add(term);
                    }
                }

                //Refresh the settings
                optionalMode = true;
                isNot = false;
            }
        }

        // A lone optional term with no must terms is promoted to a must term.
        if (null != planner.optionalTerms && null == planner.mustTerms && planner.optionalTerms.size() == 1) {
            planner.addMustTerm(planner.optionalTerms.get(0));
            planner.optionalTerms.clear();
            planner.optionalTerms = null;
        }

        if (OutpipeLog.l.isDebugEnabled()) {
            OutpipeLog.l.debug("Planner: " + planner.toString());
        }
    }

    /**
     * Splits a word (or phrase) into tokens using Lucene's StandardTokenizer.
     * <p>
     * The tokenizer is ended after the last token and is always closed, along
     * with the underlying reader, even when tokenizing fails.
     *
     * @param word the text to tokenize; null or empty yields an empty list
     * @return the tokens in the order they were produced, never null
     * @throws ApplicationFault wrapping any exception raised while tokenizing
     */
    private static List<String> standardTokenizer(String word) throws ApplicationFault {
        List<String> lstWord = new ArrayList<String>(3);
        if (null == word || word.length() == 0) {
            // new StringReader(null) would throw a NullPointerException.
            return lstWord;
        }

        Reader reader = new StringReader(word);
        StandardTokenizer fil = new StandardTokenizer(LuceneConstants.version, reader);
        try {
            CharTermAttribute termA = (CharTermAttribute) fil.getAttribute(CharTermAttribute.class);
            fil.reset();

            while (fil.incrementToken()) {
                lstWord.add(termA.toString());
            }
            // Completes the token stream contract: reset, incrementToken, end, close.
            fil.end();
        } catch (Exception ex) {
            throw new ApplicationFault(ex);
        } finally {
            try {
                fil.close();
                reader.close();
            } catch (Exception closeEx) {
                // A close failure must not mask the tokens or the original fault.
                if (OutpipeLog.l.isDebugEnabled()) {
                    OutpipeLog.l.debug("Failed to close tokenizer: " + closeEx);
                }
            }
        }
        return lstWord;
    }

    /**
     * Splits the query text into words, keeping each quoted section together.
     * <p>
     * Text outside the quoted sections is split on spaces. A quoted section
     * that is surrounded by spaces (or the text boundaries) becomes a word of
     * its own. A section that directly follows other text, as in
     * {@code title:"foo bar"}, is glued onto the preceding word. A section
     * that is directly followed by text is glued onto the following word.
     *
     * @param text the query text
     * @param splits the quoted sections of {@code text}, in order of
     *        appearance; {@code start} is the first character inside the
     *        quotes and {@code end} is the index of the closing quote
     * @return the words in order; an empty list (never null) when the text
     *         is empty
     */
    private static List<String> tokenize(String text, List<Section> splits) {
        // Allocated up front so callers always get a list; inside the loop the
        // list is non-empty exactly when at least one word has been produced.
        List<String> words = new ArrayList<String>(splits.size() * 2 + 1);
        int lastIndex = 0;
        int textLastIndex = text.length() - 1;
        // A quoted section still waiting to be glued onto the text after it.
        // NOTE(review): when two quoted sections touch (e.g. "a""b") the
        // pending phrase is overwritten below and the first one is lost.
        String phrase = null;

        for (Section is : splits) {

            //Not a phrase start, Take the section
            if (lastIndex != (is.start - 1)) {
                List<String> splittedWords = spaceTokenizer(text.substring(lastIndex, is.start - 1).trim(), ' ');
                if (null != phrase) {
                    appendPhrase(phrase, splittedWords);
                    phrase = null;
                }
                words.addAll(splittedWords);
            }

            //Extract the phrase
            phrase = text.substring(is.start, is.end);
            // Isolated start: the opening quote is the first character or follows a space.
            boolean isIsolatedStart = ((is.start - 1) == 0 || (is.start > 1 && text.charAt(is.start - 2) == ' '));
            // Isolated end: the closing quote is the last character or precedes a space.
            boolean isIsolatedEnd = (is.end == textLastIndex) || (text.charAt(is.end + 1) == ' ');

            if (isIsolatedStart && isIsolatedEnd) {
                words.add(phrase);
                phrase = null;
            } else if (isIsolatedEnd) {
                if (words.isEmpty()) {
                    words.add(phrase);
                } else {
                    // Glue the phrase onto the word right before the opening quote.
                    int lst = words.size() - 1;
                    words.set(lst, words.get(lst) + phrase);
                }
                phrase = null;
            }

            lastIndex = is.end + 1;
        }

        //Remaining last section
        if (lastIndex <= textLastIndex) {
            List<String> splittedWords = StringUtils.fastSplit(text.substring(lastIndex), ' ');
            if (null != phrase) {
                appendPhrase(phrase, splittedWords);
            }
            words.addAll(splittedWords);
        } else if (null != phrase) {
            words.add(phrase);
        }
        return words;
    }

    /**
     * Glues a phrase onto the front of the first word of a list, in place.
     * If the list is empty the phrase becomes its only element. A null phrase
     * leaves the list untouched.
     *
     * @param phrase the text to prepend, may be null
     * @param splittedWords the list to modify
     */
    private static void appendPhrase(String phrase, List<String> splittedWords) {
        if (phrase == null) {
            return;
        }
        if (splittedWords.size() <= 0) {
            splittedWords.add(phrase);
            return;
        }
        final String firstWord = splittedWords.get(0);
        splittedWords.remove(0);
        splittedWords.add(0, phrase + firstWord);
    }

    /**
     * Locates the sections of a text enclosed by pairs of a separator
     * character.
     * <p>
     * Separators are paired in order of appearance. An opening separator
     * with no closing partner is ignored.
     *
     * @param text the text to scan
     * @param separator the enclosing character, e.g. a double quote
     * @return one {@link Section} per pair, whose {@code start} is the index
     *         just after the opening separator and whose {@code end} is the
     *         index of the closing separator; empty if there are no pairs
     */
    public static List<Section> quotedText(final String text, char separator) {

        final List<Section> sections = new ArrayList<Section>();
        final int lastPos = text.length() - 1;

        int open = text.indexOf(separator);
        // An opening separator in the last position cannot have a partner.
        while (open >= 0 && open < lastPos) {
            final int close = text.indexOf(separator, open + 1);
            if (close < 0) {
                break;
            }
            sections.add(new Section(open + 1, close));
            open = text.indexOf(separator, close + 1);
        }
        return sections;
    }

    /**
     * Splits a text on a separator character.
     * <p>
     * Consecutive separators produce empty tokens. A trailing separator does
     * not produce a trailing empty token. A text without any separator is
     * returned as a single token, even when it is empty.
     *
     * @param text the text to split, must not be null
     * @param separator the character to split on
     * @return the tokens in order of appearance
     */
    public static List<String> spaceTokenizer(final String text, char separator) {
        final List<String> result = new ArrayList<String>();
        int tokenStart = 0;
        int sepAt = text.indexOf(separator);
        if (sepAt == -1) {
            result.add(text);
            return result;
        }

        while (sepAt >= 0) {
            result.add(text.substring(tokenStart, sepAt));
            tokenStart = sepAt + 1;
            sepAt = text.indexOf(separator, tokenStart);
        }

        // Keep whatever follows the last separator. Comparing against
        // length() - 1 here would drop a one-character final token.
        if (tokenStart < text.length()) {
            result.add(text.substring(tokenStart));
        }
        return result;
    }

    /**
     * A pair of positions marking a span inside a text.
     */
    public static class Section {
        /** Start position of the span. */
        int start;
        /** End position of the span. */
        int end;

        /**
         * @param start start position of the span
         * @param end end position of the span
         */
        public Section(int start, int end) {
            this.start = start;
            this.end = end;
        }

        /**
         * @return the two positions joined by a dash, e.g. {@code 3-7}
         */
        @Override
        public String toString() {
            final StringBuilder text = new StringBuilder();
            text.append(this.start).append("-").append(this.end);
            return text.toString();
        }
    }
}