net.yacy.cora.federate.solr.responsewriter.GSAResponseWriter.java Source code

Introduction

Here is the source code for net.yacy.cora.federate.solr.responsewriter.GSAResponseWriter.java
Source

/**
 *  GSAResponseWriter
 *  Copyright 2012 by Michael Peter Christen
 *  First released 14.08.2012 at http://yacy.net
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */

package net.yacy.cora.federate.solr.responsewriter;

import java.io.IOException;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.util.CommonPattern;
import net.yacy.peers.operation.yacyVersion;
import net.yacy.search.Switchboard;
import net.yacy.search.schema.CollectionSchema;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexableField;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.common.util.XML;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.QueryResponseWriter;
import org.apache.solr.response.ResultContext;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocList;
import org.apache.solr.search.SolrIndexSearcher;

/**
 * Implementation of a response writer that renders search results in the GSA (Google Search Appliance) XML format.
 * Example: GET /gsa/searchresult?q=chicken+teriyaki&output=xml&client=test&site=test&sort=date:D:S:d1
 * For an XML reference, see https://developers.google.com/search-appliance/documentation/614/xml_reference
 */
public class GSAResponseWriter implements QueryResponseWriter {

    // Cached "<version name>/<peer hash>" string written as the ENT_SOURCE element;
    // it is computed lazily while the first search result is written.
    private static String YaCyVer = null;
    // Line break character written between the XML elements of the output.
    private static final char lb = '\n';

    /**
     * Names of GSA result XML elements. Where a token is written by this class, the
     * enum constant name (via name()) is used as the XML tag name.
     */
    private enum GSAToken {
        CACHE_LAST_MODIFIED, // Date that the document was crawled, as specified in the Date HTTP header when the document was crawled for this index.
        CRAWLDATE, // An optional element that shows the date when the page was crawled. It is shown only for pages that have been crawled within the past two days.
        U, // The URL of the search result.
        UE, // The URL-encoded version of the URL that is in the U parameter.
        GD, // Contains the description of a KeyMatch result.
        T, // The title of the search result.
        RK, // Provides a ranking number used internally by the search appliance.
        ENT_SOURCE, // Identifies the application ID (serial number) of the search appliance that contributes to a result. Example: <ENT_SOURCE>S5-KUB000F0ADETLA</ENT_SOURCE>
        FS, // Additional details about the search result.
        R, // Details of an individual search result.
        S, // The snippet for the search result. Query terms appear in bold in the results. Line breaks are included for proper text wrapping.
        LANG, // Indicates the language of the search result. The LANG element contains a two-letter language code.
        HAS; // Encapsulates special features that are included for this search result.
    }

    // Fixed document prologue: XML declaration, opening GSP element and an explanatory XML comment.
    private static final char[] XML_START = ("<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n<GSP VER=\"3.2\">\n<!-- This is a Google Search Appliance API result, provided by YaCy. See https://developers.google.com/search-appliance/documentation/614/xml_reference -->\n")
            .toCharArray();
    // Fixed document epilogue: closes the GSP element opened by XML_START.
    private static final char[] XML_STOP = "</GSP>\n".toCharArray();

    // Generic one-to-one mapping: Solr field name -> GSA tag name.
    // The value of a field found in this map is written unchanged as the mapped tag.
    private static final Map<String, String> field2tag = new HashMap<String, String>();

    // pre-select a set of YaCy schema fields for the solr searcher which should cause a better caching
    private static final CollectionSchema[] extrafields = new CollectionSchema[] { CollectionSchema.id,
            CollectionSchema.sku, CollectionSchema.title, CollectionSchema.description_txt,
            CollectionSchema.last_modified, CollectionSchema.load_date_dt, CollectionSchema.size_i,
            CollectionSchema.language_s, CollectionSchema.collection_sxt };

    // Names of all Solr fields requested from the searcher for each result document:
    // the keys of field2tag plus the field names of extrafields.
    private static final Set<String> SOLR_FIELDS = new HashSet<String>();
    static {
        // the language field is the only generically mapped field; it is written as the LANG tag
        field2tag.put(CollectionSchema.language_s.getSolrFieldName(), GSAToken.LANG.name());
        SOLR_FIELDS.addAll(field2tag.keySet());
        for (CollectionSchema field : extrafields)
            SOLR_FIELDS.add(field.getSolrFieldName());
    }

    /**
     * Holder for the paging values of one response: the offset of the first returned
     * document, the requested number of rows and the total number of matches.
     */
    private static class ResHead {
        public int offset, rows, numFound;
        //public int status, QTime;
        //public String df, q, wt;
        //public float maxScore;
    }

    /**
     * Parsed form of a GSA sort parameter such as <code>date:D:S:d1</code>.
     * The parameter is split with CommonPattern.DOUBLEPOINT into up to four parts:
     * action, direction, mode and format. Missing trailing parts get the defaults
     * "D", "S" and "d1".
     */
    public static class Sort {
        public String sort = null, action = null, direction = null, mode = null, format = null;

        /**
         * Parses the given sort parameter.
         *
         * @param d the raw sort parameter; may be null, in which case all parts stay
         *          null and {@link #toSolr()} returns null
         */
        public Sort(String d) {
            this.sort = d;
            if (d == null)
                return; // nothing to parse; previously this caused a NullPointerException
            String[] s = CommonPattern.DOUBLEPOINT.split(d);
            if (s.length < 1)
                return; // the parameter consisted of separators only
            this.action = s[0]; // date
            this.direction = s.length > 1 ? s[1] : "D"; // A or D
            this.mode = s.length > 2 ? s[2] : "S"; // S, R, L
            this.format = s.length > 3 ? s[3] : "d1"; // d1
        }

        /**
         * Translates this sort definition into a Solr sort clause.
         *
         * @return "&lt;last_modified field&gt; desc" or "... asc" if the action is "date"
         *         (descending only when the direction is "D"), otherwise null
         */
        public String toSolr() {
            // "date".equals(..) is null-safe, no separate null check needed
            if ("date".equals(this.action)) {
                return CollectionSchema.last_modified.getSolrFieldName() + " "
                        + ("D".equals(this.direction) ? "desc" : "asc");
            }
            return null;
        }
    }

    /**
     * Creates a new writer instance; no configuration is done here.
     */
    public GSAResponseWriter() {
        super();
    }

    /**
     * Returns the content type of the generated response.
     *
     * @param request  the Solr request (not evaluated)
     * @param response the Solr response (not evaluated)
     * @return always the CONTENT_TYPE_XML_UTF8 constant
     */
    @Override
    public String getContentType(final SolrQueryRequest request, final SolrQueryResponse response) {
        return CONTENT_TYPE_XML_UTF8;
    }

    /**
     * Initialization hook of the response writer; intentionally empty because
     * this writer ignores the given configuration.
     *
     * @param n the writer configuration (ignored)
     */
    @Override
    public void init(@SuppressWarnings("rawtypes") NamedList n) {
    }

    /**
     * Serializes a Solr search response as a GSA XML document.
     * <p>
     * Output order: the fixed XML prologue, the TM and Q elements, PARAM elements echoing
     * the request parameters, the RES element with the hit count and optional paging links
     * (NB/PU/NU), one R element per returned document and finally the closing GSP tag.
     *
     * @param writer  target of the XML output
     * @param request the Solr request; its context map supplies the GSA parameters site,
     *                sort, client, ip, access and entqr, its params supply "originalQuery"
     *                and the Solr query
     * @param rsp     the Solr response; must contain a response header with "params"
     *                (including the rows parameter) and a "response" entry, and may
     *                contain a "highlighting" entry
     * @throws IOException if writing to the writer fails
     */
    @Override
    public void write(final Writer writer, final SolrQueryRequest request, final SolrQueryResponse rsp)
            throws IOException {
        assert rsp.getValues().get("responseHeader") != null;
        assert rsp.getValues().get("response") != null;

        long start = System.currentTimeMillis();

        SimpleOrderedMap<Object> responseHeader = (SimpleOrderedMap<Object>) rsp.getResponseHeader();
        DocList response = ((ResultContext) rsp.getValues().get("response")).docs;
        @SuppressWarnings("unchecked")
        SimpleOrderedMap<Object> highlighting = (SimpleOrderedMap<Object>) rsp.getValues().get("highlighting");
        Map<String, LinkedHashSet<String>> snippets = OpensearchResponseWriter.highlighting(highlighting);
        Map<Object, Object> context = request.getContext();

        // parse response header
        ResHead resHead = new ResHead();
        NamedList<?> val0 = (NamedList<?>) responseHeader.get("params");
        resHead.rows = Integer.parseInt((String) val0.get(CommonParams.ROWS));
        resHead.offset = response.offset(); // equal to 'start'
        resHead.numFound = response.matches();

        // write header
        writer.write(XML_START);
        String query = request.getParams().get("originalQuery");
        String site = getContextString(context, "site", "");
        String sort = getContextString(context, "sort", "");
        String client = getContextString(context, "client", "");
        String ip = getContextString(context, "ip", "");
        String access = getContextString(context, "access", "");
        String entqr = getContextString(context, "entqr", "");
        OpensearchResponseWriter.solitaireTag(writer, "TM", Long.toString(System.currentTimeMillis() - start));
        OpensearchResponseWriter.solitaireTag(writer, "Q", query);
        paramTag(writer, "sort", sort);
        paramTag(writer, "output", "xml_no_dtd");
        paramTag(writer, "ie", "UTF-8");
        paramTag(writer, "oe", "UTF-8");
        paramTag(writer, "client", client);
        paramTag(writer, "q", query);
        paramTag(writer, "site", site);
        paramTag(writer, "start", Integer.toString(resHead.offset));
        paramTag(writer, "num", Integer.toString(resHead.rows));
        paramTag(writer, "ip", ip);
        paramTag(writer, "access", access); // p - search only public content, s - search only secure content, a - search all content, both public and secure
        paramTag(writer, "entqr", entqr); // query expansion policy as requested by the client

        // body introduction
        final int responseCount = response.size();
        writer.write("<RES SN=\"" + (resHead.offset + 1) + "\" EN=\"" + (resHead.offset + responseCount) + "\">");
        writer.write(lb); // The index (1-based) of the first and last search result returned in this result set.
        writer.write("<M>" + resHead.numFound + "</M>");
        writer.write(lb); // The estimated total number of results for the search.
        writer.write("<FI/>");
        writer.write(lb); // Indicates that document filtering was performed during this search.
        int nextStart = resHead.offset + responseCount;
        // no next page if this page was not filled completely
        int nextNum = Math.min(resHead.numFound - nextStart, responseCount < resHead.rows ? 0 : resHead.rows);
        int prevStart = resHead.offset - resHead.rows;
        if (prevStart >= 0 || nextNum > 0) {
            // Common part of both paging links. All parameter values are URL-encoded so that
            // values containing blanks, '&', '#' etc. still produce a usable link.
            final String navBase = "/gsa/search?q=" + urlEncode(request.getParams().get(CommonParams.Q))
                    + "&site=" + urlEncode(site)
                    + "&lr=&ie=UTF-8&oe=UTF-8&output=xml_no_dtd&client=" + urlEncode(client)
                    + "&access=" + urlEncode(access)
                    + "&sort=" + urlEncode(sort);
            writer.write("<NB>");
            if (prevStart >= 0) {
                writer.write("<PU>");
                // a relative URL pointing to the PREVIOUS results page
                XML.escapeCharData(navBase + "&start=" + prevStart + "&sa=N", writer);
                writer.write("</PU>");
            }
            if (nextNum > 0) {
                writer.write("<NU>");
                // a relative URL pointing to the NEXT results page
                XML.escapeCharData(navBase + "&start=" + nextStart + "&num=" + nextNum + "&sa=N", writer);
                writer.write("</NU>");
            }
            writer.write("</NB>");
        }
        writer.write(lb);

        // parse body
        SolrIndexSearcher searcher = request.getSearcher();
        DocIterator iterator = response.iterator();
        for (int i = 0; i < responseCount; i++) {
            int id = iterator.nextDoc();
            Document doc = searcher.doc(id, SOLR_FIELDS);
            List<IndexableField> fields = doc.getFields();

            // pre-scan the fields to get the mime-type
            String mime = "";
            for (IndexableField value : fields) {
                String fieldName = value.name();
                if (CollectionSchema.content_type.getSolrFieldName().equals(fieldName)) {
                    mime = value.stringValue();
                    break;
                }
            }

            // write the R header for a search result; the mime type is document content and
            // must therefore be escaped before it is placed into an attribute
            writer.write("<R N=\"" + (resHead.offset + i + 1) + "\"" + (i == 1 ? " L=\"2\"" : ""));
            if (mime != null && mime.length() > 0) {
                writer.write(" MIME=\"");
                XML.escapeAttributeValue(mime, writer);
                writer.write("\"");
            }
            writer.write(">");
            writer.write(lb);

            // per-document state; the url hash must not leak from the previous document
            String urlhash = null;
            List<String> descriptions = new ArrayList<String>();
            List<String> collections = new ArrayList<String>();
            int size = 0;
            boolean title_written = false; // the solr index may contain several; we take only the first which should be the visible tag in <title></title>
            String title = null;
            for (IndexableField value : fields) {
                String fieldName = value.name();

                // apply generic matching rule
                String stag = field2tag.get(fieldName);
                if (stag != null) {
                    OpensearchResponseWriter.solitaireTag(writer, stag, value.stringValue());
                    continue;
                }

                // if the rule is not generic, use the specific here
                if (CollectionSchema.id.getSolrFieldName().equals(fieldName)) {
                    urlhash = value.stringValue();
                    continue;
                }
                if (CollectionSchema.sku.getSolrFieldName().equals(fieldName)) {
                    OpensearchResponseWriter.solitaireTag(writer, GSAToken.U.name(), value.stringValue());
                    OpensearchResponseWriter.solitaireTag(writer, GSAToken.UE.name(), value.stringValue());
                    continue;
                }
                if (CollectionSchema.title.getSolrFieldName().equals(fieldName) && !title_written) {
                    title = value.stringValue();
                    OpensearchResponseWriter.solitaireTag(writer, GSAToken.T.name(), highlight(title, query));
                    title_written = true;
                    continue;
                }
                if (CollectionSchema.description_txt.getSolrFieldName().equals(fieldName)) {
                    descriptions.add(value.stringValue());
                    continue;
                }
                if (CollectionSchema.last_modified.getSolrFieldName().equals(fieldName)) {
                    Date d = new Date(Long.parseLong(value.stringValue()));
                    writer.write("<FS NAME=\"date\" VALUE=\"" + HeaderFramework.formatGSAFS(d) + "\"/>\n");
                    continue;
                }
                if (CollectionSchema.load_date_dt.getSolrFieldName().equals(fieldName)) {
                    Date d = new Date(Long.parseLong(value.stringValue()));
                    OpensearchResponseWriter.solitaireTag(writer, GSAToken.CRAWLDATE.name(),
                            HeaderFramework.formatRFC1123(d));
                    continue;
                }
                if (CollectionSchema.size_i.getSolrFieldName().equals(fieldName)) {
                    size = value.stringValue() != null && value.stringValue().length() > 0
                            ? Integer.parseInt(value.stringValue())
                            : -1;
                    continue;
                }
                if (CollectionSchema.collection_sxt.getSolrFieldName().equals(fieldName)) {
                    collections.add(value.stringValue());
                    continue;
                }
                // any other field is superfluous here; this can be avoided setting the enableLazyFieldLoading = false in solrconfig.xml
            }

            // the snippet is taken from the highlighting if available, else from the first description
            LinkedHashSet<String> snippet = urlhash == null ? null : snippets.get(urlhash);
            OpensearchResponseWriter.removeSubsumedTitle(snippet, title);
            OpensearchResponseWriter.solitaireTag(writer, GSAToken.S.name(),
                    snippet == null || snippet.size() == 0 ? (descriptions.size() > 0 ? descriptions.get(0) : "")
                            : OpensearchResponseWriter.getLargestSnippet(snippet));
            OpensearchResponseWriter.solitaireTag(writer, GSAToken.GD.name(),
                    descriptions.size() > 0 ? descriptions.get(0) : "");
            String cols = collections.toString();
            if (collections.size() > 0)
                // several collections are written comma-separated: strip the brackets of
                // List.toString() and remove all blanks
                OpensearchResponseWriter.solitaireTag(writer, "COLS" /*SPECIAL!*/,
                        collections.size() > 1 ? cols.substring(1, cols.length() - 1).replaceAll(" ", "")
                                : collections.get(0));
            writer.write("<HAS><L/><C SZ=\"");
            writer.write(Integer.toString(size / 1024));
            writer.write("k\" CID=\"");
            writer.write(urlhash == null ? "" : urlhash); // Writer.write(null) would throw a NullPointerException
            writer.write("\" ENC=\"UTF-8\"/></HAS>\n");
            if (YaCyVer == null)
                YaCyVer = yacyVersion.thisVersion().getName() + "/"
                        + Switchboard.getSwitchboard().peers.mySeed().hash;
            OpensearchResponseWriter.solitaireTag(writer, GSAToken.ENT_SOURCE.name(), YaCyVer);
            OpensearchResponseWriter.closeTag(writer, "R");
        }
        writer.write("</RES>");
        writer.write(lb);
        writer.write(XML_STOP);
    }

    /**
     * URL-encodes a query parameter value with UTF-8 for use in the paging links.
     *
     * @param value the raw parameter value, may be null
     * @return the encoded value, or the empty string if the value is null or empty
     * @throws IOException if the UTF-8 encoding is not supported by the platform
     */
    private static String urlEncode(final String value) throws IOException {
        if (value == null || value.length() == 0)
            return "";
        return java.net.URLEncoder.encode(value, "UTF-8");
    }

    /**
     * Reads a string-valued entry from the request context.
     *
     * @param context the request context map
     * @param key     the key to look up
     * @param dflt    the value returned when no usable entry exists
     * @return the entry itself if it is a String, the first element if it is a non-empty
     *         String array, otherwise (missing, empty array, other type) the default
     */
    private static String getContextString(Map<Object, Object> context, String key, String dflt) {
        final Object raw = context.get(key);
        if (raw instanceof String) {
            return (String) raw;
        }
        if (raw instanceof String[]) {
            final String[] values = (String[]) raw;
            if (values.length > 0) {
                return values[0];
            }
        }
        // covers a missing entry (null is no instance of anything), an empty array and foreign types
        return dflt;
    }

    /**
     * Writes a single PARAM element of the form
     * &lt;PARAM name="..." value="..." original_value="..."/&gt; followed by a line break.
     * The value is written attribute-escaped into both the value and the original_value
     * attribute. Nothing is written if the value is null or empty.
     *
     * @param writer  target of the output
     * @param tagname the content of the name attribute (written as given)
     * @param value   the parameter value
     * @throws IOException if writing fails
     */
    public static void paramTag(final Writer writer, final String tagname, String value) throws IOException {
        final boolean hasValue = value != null && value.length() > 0;
        if (!hasValue) {
            return;
        }
        writer.write("<PARAM name=\"");
        writer.write(tagname);
        // both attributes carry the same escaped value
        for (final String attributeStart : new String[] { "\" value=\"", "\" original_value=\"" }) {
            writer.write(attributeStart);
            XML.escapeAttributeValue(value, writer);
        }
        writer.write("\"/>");
        writer.write(lb);
    }

    /**
     * Marks query terms inside a text with &lt;b&gt;...&lt;/b&gt;.
     * <p>
     * The query is split into terms at blanks and plus signs (using CommonPattern.SPACE and
     * CommonPattern.PLUS). For every non-empty term its first case-insensitive occurrence
     * in the text is highlighted. A single blank between two highlighted parts is included
     * in the highlighting, so that adjacent terms form one bold run.
     * <p>
     * All terms are located in the original text and the markup is generated afterwards.
     * Therefore a term can never match inside a previously inserted tag, and overlapping
     * matches produce well-formed, non-nested markup.
     *
     * @param text  the text to highlight; may be null
     * @param query the query string; may be null
     * @return the text with highlighting markup, or the unchanged text (possibly null)
     *         if the text or the query is null or no term occurs in the text
     */
    public static String highlight(String text, String query) {
        if (text == null || query == null) {
            return text;
        }
        final String[] terms = CommonPattern.SPACE.split(CommonPattern.PLUS.matcher(query.trim()).replaceAll(" "));
        final int length = text.length();
        final boolean[] bold = new boolean[length]; // bold[i]: character i belongs to a highlighted run
        boolean found = false;
        for (final String term : terms) {
            final int termLength = term.length();
            if (termLength == 0) {
                continue; // empty query or consecutive separators; an empty term matches everywhere
            }
            // first case-insensitive occurrence; regionMatches avoids locale dependent
            // lower-casing which may change the length of the text
            int p = -1;
            for (int i = 0; i + termLength <= length; i++) {
                if (text.regionMatches(true, i, term, 0, termLength)) {
                    p = i;
                    break;
                }
            }
            if (p < 0) {
                continue;
            }
            for (int i = p; i < p + termLength; i++) {
                bold[i] = true;
            }
            found = true;
        }
        if (!found) {
            return text;
        }
        // join runs which are separated by a single blank only ("</b> <b>" becomes " ")
        for (int i = 1; i < length - 1; i++) {
            if (!bold[i] && text.charAt(i) == ' ' && bold[i - 1] && bold[i + 1]) {
                bold[i] = true;
            }
        }
        // generate the markup: open/close a tag whenever the highlighting state changes
        final StringBuilder sb = new StringBuilder(length + 16);
        boolean open = false;
        for (int i = 0; i < length; i++) {
            if (bold[i] != open) {
                sb.append(open ? "</b>" : "<b>");
                open = !open;
            }
            sb.append(text.charAt(i));
        }
        if (open) {
            sb.append("</b>");
        }
        return sb.toString();
    }
}