de.ingrid.search.utils.facet.FacetClassProducer.java Source code

Java tutorial

Introduction

Here is the source code for de.ingrid.search.utils.facet.FacetClassProducer.java

Source

/*
 * **************************************************-
 * ingrid-search-utils
 * ==================================================
 * Copyright (C) 2014 - 2015 wemove digital solutions GmbH
 * ==================================================
 * Licensed under the EUPL, Version 1.1 or  as soon they will be
 * approved by the European Commission - subsequent versions of the
 * EUPL (the "Licence");
 * 
 * You may not use this work except in compliance with the Licence.
 * You may obtain a copy of the Licence at:
 * 
 * http://ec.europa.eu/idabc/eupl5
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the Licence is distributed on an "AS IS" basis,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the Licence for the specific language governing permissions and
 * limitations under the Licence.
 * **************************************************#
 */
package de.ingrid.search.utils.facet;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.log4j.Logger;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.util.OpenBitSet;

import de.ingrid.search.utils.IQueryParsers;
import de.ingrid.search.utils.LuceneIndexReaderWrapper;
import de.ingrid.utils.query.FieldQuery;
import de.ingrid.utils.query.IngridQuery;
import de.ingrid.utils.queryparser.ParseException;
import de.ingrid.utils.queryparser.QueryStringParser;

/**
 * Produces {@link FacetClass} instances from facet definitions.
 * <p>
 * A facet class is created by translating a query fragment (InGrid query
 * syntax) into a Lucene query and asking {@link FacetUtils} for the bitsets of
 * matching documents, one bitset per index reader of the configured
 * {@link LuceneIndexReaderWrapper}.
 * <p>
 * Both the index reader wrapper and the query parsers have to be injected via
 * their setters before any of the produce methods is used.
 */
public class FacetClassProducer {

    // upper bound for the number of terms (and thus facet classes) that are
    // collected for a single index field
    private static final int MAX_NUM = 300;

    private static final Logger LOG = Logger.getLogger(FacetClassProducer.class);

    private LuceneIndexReaderWrapper indexReaderWrapper;

    private IQueryParsers _queryParsers;

    public FacetClassProducer() {
    }

    /**
     * Creates a single facet class from a facet class definition.
     * <p>
     * If the definition has no fragment, a facet class with an (unfilled)
     * bitset array sized to the number of index readers is returned and a
     * warning is logged.
     * 
     * @param facetClassDef
     *            the definition supplying name and query fragment
     * @return the facet class, or <code>null</code> if the fragment could not
     *         be parsed or the index could not be read (the error is logged)
     */
    public FacetClass produceClass(FacetClassDefinition facetClassDef) {
        FacetClass fc = null;
        try {
            long start = 0;
            if (LOG.isDebugEnabled()) {
                start = System.currentTimeMillis();
            }
            if (facetClassDef.getFragment() != null) {
                Query q = getLuceneQuery(facetClassDef.getFragment());
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Create facet class '" + facetClassDef.getName() + "' with fragment '"
                            + facetClassDef.getFragment() + "' from query '" + q + "'.");
                }
                fc = produceClassFromQuery(facetClassDef.getName(), q);
            } else {
                LOG.warn("Create EMPTY facet class '" + facetClassDef.getName() + "'. No fragment set.");
                fc = new FacetClass(facetClassDef.getName(),
                        new OpenBitSet[indexReaderWrapper.getIndexReader().length]);
            }
            if (LOG.isDebugEnabled()) {
                LOG.debug("Create facet class template for '" + fc + "' in " + (System.currentTimeMillis() - start)
                        + " ms.");
            }
        } catch (IOException e) {
            LOG.error("Error producing facet class '" + facetClassDef.getName() + "'.", e);
        } catch (ParseException e) {
            LOG.error("Error producing facet class '" + facetClassDef.getName() + "'.", e);
        }

        return fc;
    }

    /**
     * Creates one facet class per term of the facet's index field.
     * <p>
     * Without a query fragment the most frequent terms of the field (by
     * document frequency, summed over all index readers) are used. With a
     * query fragment only terms occurring in documents matched by the
     * fragment are used, and every resulting class is restricted by the
     * fragment as well. At most {@link #MAX_NUM} terms are kept. The classes
     * are returned in ascending order of their term frequency.
     * 
     * @param facetDef
     *            the facet definition supplying field and optional query
     *            fragment
     * @return the facet classes created so far; errors are logged and end the
     *         production, so the list may be empty or incomplete but is never
     *         <code>null</code>
     */
    public List<FacetClass> produceClasses(FacetDefinition facetDef) {
        List<FacetClass> fClasses = new ArrayList<FacetClass>();
        try {
            String fragment = facetDef.getQueryFragment();
            if (fragment == null) {
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Create classes from index field '" + facetDef.getField() + "'.");
                }
                // presume we have a single field definition
                for (TermInfo ti : getHighFreqTerms(MAX_NUM, facetDef.getField())) {
                    String termQuery = ti.term.field() + ":" + ti.term.text();
                    addFacetClass(fClasses, termQuery, termQuery);
                }
            } else {
                // we have a query fragment: only count documents it matches
                Query query = getLuceneQuery(fragment);
                OpenBitSet[] bitSets = FacetUtils.getBitSetsFromQuery(query, indexReaderWrapper);
                Map<Term, Integer> termCounts = new HashMap<Term, Integer>();
                for (int i = 0; i < bitSets.length; i++) {
                    IndexReader indexReader = indexReaderWrapper.getIndexReader()[i];
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Read terms from field '" + facetDef.getField() + "' for bitset " + i);
                    }
                    collectTermCounts(indexReader, bitSets[i], facetDef.getField(), termCounts);
                }

                // Build the classes only once after ALL readers have been
                // scanned. Every facet class already covers all readers, so
                // building them per reader would create duplicates.
                for (TermInfo ti : toSortedTermInfos(termCounts)) {
                    String termQuery = ti.term.field() + ":" + ti.term.text();
                    addFacetClass(fClasses, termQuery, fragment + " " + termQuery);
                }
            }
        } catch (Exception e) {
            // best effort: report the problem and hand back what we have
            LOG.error("Error producing facet classes from facet '" + facetDef.getName() + "'.", e);
        }
        return fClasses;
    }

    /**
     * Creates a facet class whose bitsets contain the documents matching the
     * given Lucene query.
     * 
     * @param name
     *            the name of the facet class
     * @param query
     *            the Lucene query selecting the documents of the class
     * @return the new facet class
     * @throws IOException
     *             declared for callers; kept for backward compatibility
     */
    public FacetClass produceClassFromQuery(String name, Query query) throws IOException {
        return new FacetClass(name, FacetUtils.getBitSetsFromQuery(query, indexReaderWrapper));
    }

    /**
     * Parses the InGrid query with the configured query parsers and returns
     * the bitsets of the matching documents.
     * 
     * @param ingridQuery
     *            the query to evaluate
     * @return the bitsets as delivered by {@link FacetUtils}
     */
    public OpenBitSet[] getBitSetFromQuery(IngridQuery ingridQuery) {
        return FacetUtils.getBitSetsFromQuery(_queryParsers.parse(ingridQuery), indexReaderWrapper);
    }

    public LuceneIndexReaderWrapper getIndexReaderWrapper() {
        return indexReaderWrapper;
    }

    public void setIndexReaderWrapper(LuceneIndexReaderWrapper indexReaderWrapper) {
        this.indexReaderWrapper = indexReaderWrapper;
    }

    public IQueryParsers get_queryParsers() {
        return _queryParsers;
    }

    public void setQueryParsers(IQueryParsers queryParsers) {
        _queryParsers = queryParsers;
    }

    /**
     * Parses the query string, creates the facet class for it and appends it
     * to the list. The time needed is logged on info level.
     */
    private void addFacetClass(List<FacetClass> fClasses, String name, String queryString)
            throws ParseException, IOException {
        long start = 0;
        if (LOG.isInfoEnabled()) {
            start = System.currentTimeMillis();
        }
        FacetClass facetClass = produceClassFromQuery(name, getLuceneQuery(queryString));
        fClasses.add(facetClass);
        if (LOG.isInfoEnabled()) {
            LOG.info("Create facet class: " + facetClass + " in " + (System.currentTimeMillis() - start) + " ms.");
        }
    }

    /**
     * Iterates over all terms of the field in one index reader and adds, per
     * term, the number of documents that are also set in the given bitset to
     * the passed map. Terms without hits and terms with empty text are
     * ignored. The map is capped at {@link #MAX_NUM} entries by evicting the
     * entry with the lowest count.
     */
    private static void collectTermCounts(IndexReader indexReader, OpenBitSet matchingDocs, String field,
            Map<Term, Integer> termCounts) throws IOException {
        TermEnum termEnum = indexReader.terms(new Term(field, ""));
        if (termEnum == null) {
            return;
        }
        try {
            // open termDocs only once, and use seek: this is more efficient
            TermDocs termDocs = indexReader.termDocs();
            try {
                do {
                    Term term = termEnum.term();
                    if (term == null) {
                        // enumeration is exhausted, nothing to log or count
                        break;
                    }
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Term found: '" + term.toString() + "' [term-field: " + term.field()
                                + ", facet-field:" + field + "].");
                    }
                    if (!term.field().equals(field)) {
                        // reached the terms of the next field
                        break;
                    }

                    int count = 0;
                    termDocs.seek(term);
                    while (termDocs.next()) {
                        if (matchingDocs.get(termDocs.doc())) {
                            count++;
                        }
                    }
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Occurence found:" + count);
                    }

                    if (count > 0 && !"".equals(term.text())) {
                        // sum up the hits of the same term across readers
                        Integer previous = termCounts.get(term);
                        int total = (previous == null) ? count : previous.intValue() + count;
                        termCounts.put(term, total);
                        if (termCounts.size() > MAX_NUM) {
                            evictMinimalTerm(termCounts);
                        }
                    }
                } while (termEnum.next());
            } finally {
                termDocs.close();
            }
        } finally {
            termEnum.close();
        }
    }

    /**
     * Translates a query string in InGrid syntax into a Lucene query and adds
     * the special fields (partner, provider, datatype) as required clauses.
     */
    private Query getLuceneQuery(String definition) throws ParseException {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Start getting LuceneQuery from IngridQuery String: " + definition);
        }
        IngridQuery iq = QueryStringParser.parse(definition);
        Query q = _queryParsers.parse(iq);
        if (LOG.isDebugEnabled()) {
            LOG.debug("Resulting lucene query after parsing: " + q);
        }
        return addSpecialFields(q, iq);
    }

    /**
     * Wraps the query into a boolean query that additionally requires every
     * partner, provider and datatype field query of the InGrid query. The
     * passed query itself is only added if its string form is not empty.
     */
    private BooleanQuery addSpecialFields(Query q, IngridQuery iq) {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Add special fields (partner, provider, datatype) from IngridQuery: " + iq);
        }

        BooleanQuery bq = new BooleanQuery();
        addRequiredFieldTerms(bq, iq, "partner");
        addRequiredFieldTerms(bq, iq, "provider");
        addRequiredFieldTerms(bq, iq, "datatype");
        if (q.toString().length() != 0) {
            bq.add(q, Occur.MUST);
        }
        return bq;
    }

    /**
     * Adds a required term query for every field query stored in the InGrid
     * query under the given key. Does nothing if the key is not present.
     */
    @SuppressWarnings("unchecked")
    private static void addRequiredFieldTerms(BooleanQuery target, IngridQuery iq, String key) {
        List<FieldQuery> fieldQueries = iq.getArrayList(key);
        if (fieldQueries == null) {
            return;
        }
        for (FieldQuery fieldQuery : fieldQueries) {
            Term luceneTerm = new Term(fieldQuery.getFieldName(), fieldQuery.getFieldValue());
            target.add(new TermQuery(luceneTerm), Occur.MUST);
        }
    }

    /**
     * Collects the terms of a field with the highest document frequency, the
     * frequency being summed over all index readers.
     * 
     * @param numTerms
     *            maximum number of terms to keep
     * @param field
     *            the index field whose terms are read
     * @return the terms in ascending order of frequency; an empty array if no
     *         index reader wrapper or no field is set
     * @throws IOException
     *             if the index cannot be read
     */
    private TermInfo[] getHighFreqTerms(int numTerms, String field) throws IOException {
        if (indexReaderWrapper == null || field == null) {
            LOG.warn("Cannot read terms, index reader wrapper or field is not set [field: " + field + "].");
            return new TermInfo[0];
        }
        Map<Term, Integer> termFreqs = new HashMap<Term, Integer>();

        for (IndexReader indexReader : indexReaderWrapper.getIndexReader()) {
            TermEnum terms = indexReader.terms(new Term(field, ""));
            if (terms == null) {
                continue;
            }
            try {
                if (terms.term() == null || !terms.term().field().equals(field)) {
                    // this reader has no terms for the field
                    continue;
                }

                // frequency of the last evicted term; everything that is not
                // more frequent will not make it into the result anyway
                int minFreq = 0;
                boolean moreTerms = true;
                while (moreTerms) {
                    Term term = terms.term();
                    int docFreq = terms.docFreq();
                    Integer known = termFreqs.get(term);
                    if (known != null) {
                        docFreq += known.intValue();
                    }
                    if (docFreq > minFreq) {
                        termFreqs.put(term, docFreq);
                        if (termFreqs.size() > numTerms) {
                            minFreq = evictMinimalTerm(termFreqs);
                        }
                    }
                    // stop at the end of the enumeration or if another field
                    // is reached, we only want the terms of one field
                    moreTerms = terms.next() && terms.term() != null && field.equals(terms.term().field());
                }
            } finally {
                // always release the enumeration, also on 'continue'
                terms.close();
            }
        }
        return toSortedTermInfos(termFreqs);
    }

    /**
     * Removes the entry with the lowest frequency from the map (the first one
     * found in iteration order if several share the minimum).
     * 
     * @return the frequency of the removed entry, 0 if the map was empty
     */
    private static int evictMinimalTerm(Map<Term, Integer> termFreqs) {
        Term minTerm = null;
        int minFreq = 0;
        for (Map.Entry<Term, Integer> entry : termFreqs.entrySet()) {
            int freq = entry.getValue().intValue();
            if (minTerm == null || freq < minFreq) {
                minTerm = entry.getKey();
                minFreq = freq;
            }
        }
        if (minTerm != null) {
            termFreqs.remove(minTerm);
        }
        return minFreq;
    }

    /**
     * Converts the term/frequency map into an array sorted by ascending
     * frequency.
     */
    private static TermInfo[] toSortedTermInfos(Map<Term, Integer> termFreqs) {
        TermInfo[] result = new TermInfo[termFreqs.size()];
        int idx = 0;
        for (Map.Entry<Term, Integer> entry : termFreqs.entrySet()) {
            result[idx++] = new TermInfo(entry.getKey(), entry.getValue().intValue());
        }
        Arrays.sort(result, new TermInfoComparator());
        return result;
    }

    /**
     * Orders term infos by ascending document frequency.
     */
    private static final class TermInfoComparator implements Comparator<TermInfo> {

        @Override
        public int compare(TermInfo ti1, TermInfo ti2) {
            if (ti1.docFreq < ti2.docFreq) {
                return -1;
            }
            if (ti1.docFreq > ti2.docFreq) {
                return 1;
            }
            return 0;
        }

    }

    /**
     * Immutable pair of a term and the number of documents counted for it.
     */
    private static final class TermInfo {
        public final Term term;
        public final int docFreq;

        public TermInfo(Term t, int df) {
            this.term = t;
            this.docFreq = df;
        }
    }

}