ddf.catalog.cache.solr.impl.DynamicSchemaResolver.java Source code

Introduction

Here is the source code for ddf.catalog.cache.solr.impl.DynamicSchemaResolver.java
Source

/**
 * Copyright (c) Codice Foundation
 * 
 * This is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser
 * General Public License as published by the Free Software Foundation, either version 3 of the
 * License, or any later version.
 * 
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
 * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details. A copy of the GNU Lesser General Public License
 * is distributed along with this program and can be found at
 * <http://www.gnu.org/licenses/lgpl.html>.
 * 
 **/
package ddf.catalog.cache.solr.impl;

import ddf.catalog.data.AttributeDescriptor;
import ddf.catalog.data.AttributeType.AttributeFormat;
import ddf.catalog.data.Metacard;
import ddf.catalog.data.MetacardCreationException;
import ddf.catalog.data.MetacardType;
import ddf.catalog.data.impl.MetacardTypeImpl;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.codehaus.stax2.XMLInputFactory2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

/**
 * This class tries to resolve all user given field names to their corresponding dynamic Solr index
 * field name. This class takes most of its logic directly from the configured Solr schema.xml. For
 * instance, the suffixes enumerated in this class are directly copied from the schema.xml.
 * 
 * @since 0.2.0
 */
public class DynamicSchemaResolver {

    protected static final char FIRST_CHAR_OF_SUFFIX = '_';

    protected static final String COULD_NOT_READ_METACARD_TYPE_MESSAGE = "Could not read MetacardType.";

    protected static final String FIELDS_KEY = "fields";

    protected static final String COULD_NOT_SERIALIZE_OBJECT_MESSAGE = "Could not serialize object";

    private static final String SOLR_CLOUD_VERSION_FIELD = "_version_";

    private static final List<String> PRIVATE_SOLR_FIELDS = Arrays.asList(SOLR_CLOUD_VERSION_FIELD,
            SchemaFields.METACARD_TYPE_FIELD_NAME, SchemaFields.METACARD_TYPE_OBJECT_FIELD_NAME);

    private static final Logger LOGGER = LoggerFactory.getLogger(DynamicSchemaResolver.class);

    public static final String LUX_XML_FIELD_NAME = "lux_xml";

    protected Set<String> fieldsCache = new HashSet<String>();

    protected SchemaFields schemaFields;

    protected Map<String, MetacardType> metacardTypesCache = new HashMap<String, MetacardType>();

    protected Map<String, byte[]> metacardTypeNameToSerialCache = new HashMap<String, byte[]>();

    protected static final XMLInputFactory xmlInputFactory;

    static {
        ClassLoader tccl = Thread.currentThread().getContextClassLoader();
        try {
            Thread.currentThread().setContextClassLoader(DynamicSchemaResolver.class.getClassLoader());

            xmlInputFactory = XMLInputFactory2.newInstance();
            xmlInputFactory.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, Boolean.FALSE);
            xmlInputFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
            xmlInputFactory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.FALSE);
            xmlInputFactory.setProperty(XMLInputFactory.IS_VALIDATING, Boolean.FALSE);
        } finally {
            Thread.currentThread().setContextClassLoader(tccl);
        }
    }

    public DynamicSchemaResolver() {
        this.schemaFields = new SchemaFields();

        fieldsCache.add(Metacard.ID + SchemaFields.TEXT_SUFFIX);
        fieldsCache.add(Metacard.ID + SchemaFields.TEXT_SUFFIX + SchemaFields.TOKENIZED);
        fieldsCache.add(Metacard.ID + SchemaFields.TEXT_SUFFIX + SchemaFields.TOKENIZED + SchemaFields.HAS_CASE);
    }

    /**
     * Adds the fields that are already in the server to the cache. This method should be called
     * once the SolrServer is up to ensure the cache is synchronized with the server.
     * 
     * @param server
     *            the SolrServer we are working with
     */
    public void addFieldsFromServer(SolrServer server) {
        if (server == null) {
            LOGGER.warn("Server is null, could not add fields to cache.");
            return;
        }

        SolrQuery query = new SolrQuery();

        // numterms=0 means retrieve everything (regular or dynamic fields)
        query.add("numterms", "0");

        /*
         * Adding this request handler allows us to query the schema dynamically. The name of the
         * request handler is provided by the schema.xml. If the name is changed in the schema.xml,
         * then this value must be changed as well.
         */
        query.setRequestHandler("/admin/luke");

        QueryResponse response = null;

        try {
            response = server.query(query);
            for (Entry<String, ?> e : ((SimpleOrderedMap<?>) (response.getResponse().get(FIELDS_KEY)))) {
                fieldsCache.add(e.getKey());
            }
        } catch (SolrServerException e) {
            LOGGER.warn("Could not update cache for field names.", e);
        } catch (SolrException e) {
            LOGGER.warn("Could not update cache for field names.", e);
        }
    }

    /**
     * Adds the fields of the Metacard into the {@link SolrInputDocument}
     * 
     * @param metacard
     * @param solrInputDocument
     * @throws IngestException
     */
    public void addFields(Metacard metacard, SolrInputDocument solrInputDocument) throws MetacardCreationException {
        MetacardType schema = metacard.getMetacardType();

        // TODO: register these metacard types when a new one is seen

        for (AttributeDescriptor ad : schema.getAttributeDescriptors()) {
            if (metacard.getAttribute(ad.getName()) != null) {
                Serializable attributeValue = metacard.getAttribute(ad.getName()).getValue();

                if (attributeValue != null) {
                    AttributeFormat format = ad.getType().getAttributeFormat();
                    String formatIndexName = ad.getName() + getFieldSuffix(format);

                    if (AttributeFormat.XML.equals(format)) {
                        // raw
                        solrInputDocument.addField(formatIndexName, attributeValue);

                        // text
                        String specialStringIndexName = ad.getName() + getFieldSuffix(AttributeFormat.STRING)
                                + getSpecialIndexSuffix(AttributeFormat.STRING);
                        String parsedText = parseTextFrom(attributeValue.toString());
                        solrInputDocument.addField(specialStringIndexName, parsedText);

                        // text case sensitive
                        solrInputDocument.addField(getCaseSensitiveField(specialStringIndexName), parsedText);
                    } else if (AttributeFormat.GEOMETRY.equals(format)) {
                        solrInputDocument.addField(formatIndexName, attributeValue);
                    } else if (AttributeFormat.OBJECT.equals(format)) {
                        ByteArrayOutputStream byteArrayOS = new ByteArrayOutputStream();
                        ObjectOutputStream out = null;

                        try {
                            out = new ObjectOutputStream(byteArrayOS);
                            out.writeObject(attributeValue);
                            out.close();
                        } catch (IOException e) {
                            LOGGER.warn(COULD_NOT_SERIALIZE_OBJECT_MESSAGE, e);
                            throw new MetacardCreationException(COULD_NOT_SERIALIZE_OBJECT_MESSAGE);
                        }

                        solrInputDocument.addField(formatIndexName, byteArrayOS.toByteArray());
                    } else {
                        solrInputDocument.addField(formatIndexName, attributeValue);
                    }
                }
            }
        }

        //        if (!ConfigurationStore.getInstance().isDisableTextPath()) {
        if (StringUtils.isNotBlank(metacard.getMetadata())) {
            solrInputDocument.addField(LUX_XML_FIELD_NAME, metacard.getMetadata());
        }
        //        }

        /*
         * Lastly the metacardType must be added to the solr document. These are internal fields
         */
        solrInputDocument.addField(SchemaFields.METACARD_TYPE_FIELD_NAME, schema.getName());
        byte[] metacardTypeBytes = metacardTypeNameToSerialCache.get(schema.getName());

        if (metacardTypeBytes == null) {
            MetacardType coreMetacardType = new MetacardTypeImpl(schema.getName(),
                    schema.getAttributeDescriptors());
            metacardTypesCache.put(schema.getName(), coreMetacardType);

            metacardTypeBytes = serialize(coreMetacardType);
            metacardTypeNameToSerialCache.put(schema.getName(), metacardTypeBytes);

            addToFieldsCache(coreMetacardType.getAttributeDescriptors());
        }

        solrInputDocument.addField(SchemaFields.METACARD_TYPE_OBJECT_FIELD_NAME, metacardTypeBytes);

        solrInputDocument.addField(SchemaFields.CACHED_DATE, new Date());
    }

    /**
     * Returns the best approximation as to what {@link AttributeFormat} this Solr Field is.
     *
     * @param solrFieldName
     *            name of the Solr field
     * @return the {@link AttributeFormat} associated with the Solr field
     */
    public AttributeFormat getType(String solrFieldName) {

        String suffix = "";
        int lastIndexOfUndercore = solrFieldName.lastIndexOf(FIRST_CHAR_OF_SUFFIX);

        if (lastIndexOfUndercore != -1) {
            suffix = solrFieldName.substring(lastIndexOfUndercore, solrFieldName.length());
        }

        return schemaFields.getFormat(suffix);
    }

    public Serializable getDocValue(String solrFieldName, Object docValue) {

        AttributeFormat format = getType(solrFieldName);

        if (AttributeFormat.SHORT.equals(format)) {
            /*
             * We have inside knowledge that user-given short objects are stored as Integers in
             * Solr. You cannot cast an int to a short, so the workaround is to parse it. This
             * should not lead to any loss of information because the value was originally a short.
             */
            return Short.parseShort(docValue.toString());
        } else if (AttributeFormat.OBJECT.equals(format)) {

            ByteArrayInputStream bais = null;
            ObjectInputStream in = null;
            try {
                bais = new ByteArrayInputStream((byte[]) docValue);
                in = new ObjectInputStream(bais);
                return (Serializable) in.readObject();
            } catch (IOException e) {
                LOGGER.warn("IO exception loading input document", e);
            } catch (ClassNotFoundException e) {
                LOGGER.warn("Could not create object to return.", e);
                // TODO which exception to throw?
            } finally {
                IOUtils.closeQuietly(bais);
                IOUtils.closeQuietly(in);
            }

            return null;
        } else {
            return ((Serializable) docValue);
        }
    }

    /**
     * PRE-CONDITION is that fieldname cannot be null.
     *
     * The convention is that we add a suffix starting with an underscore, so if we find the last
     * underscore, then we can return the original field name.
     *
     * @param solrFieldName
     * @return the original field name
     */
    public String resolveFieldName(String solrFieldName) {

        int lastIndexOfUndercore = solrFieldName.lastIndexOf(FIRST_CHAR_OF_SUFFIX);

        if (lastIndexOfUndercore != -1) {
            return solrFieldName.substring(0, lastIndexOfUndercore);
        }
        return solrFieldName;
    }

    public boolean isPrivateField(String solrFieldName) {
        return PRIVATE_SOLR_FIELDS.contains(solrFieldName);
    }

    /**
     * Attempts to resolve the name of a field without being given an {@link AttributeFormat}
     *
     * @param field
     *            user given field name
     * @return a list of possible Solr field names that match the given field. If none are found,
     *         then an empty list is returned
     */
    public List<String> getAnonymousField(String field) {

        ArrayList<String> list = new ArrayList<String>();

        for (AttributeFormat format : AttributeFormat.values()) {

            String fullFieldName = field + schemaFields.getFieldSuffix(format);

            if (fieldsCache.contains(fullFieldName)) {
                list.add(fullFieldName);
            }

        }

        return list;
    }

    /**
     * Attempts to find the fieldName for the given propertyName value.
     *
     * @param propertyName
     *            property name provided by user
     * @param format
     *            {@link AttributeFormat} that describes the type of {@link Attribute} the field is
     * @param isSearchedAsExactValue
     *            specifies if any special index suffixes need to be added to the field
     * @return the proper schema field name. If a schema name cannot be found in cache, returns a
     *         schema field name that matches the dynamic field type formatting.
     */
    public String getField(String propertyName, AttributeFormat format, boolean isSearchedAsExactValue) {

        String fieldName = propertyName + schemaFields.getFieldSuffix(format)
                + (isSearchedAsExactValue ? "" : getSpecialIndexSuffix(format));

        if (fieldsCache.contains(fieldName)) {
            return fieldName;
        }

        switch (format) {
        case DOUBLE:
        case LONG:
        case INTEGER:
        case SHORT:
        case FLOAT:
            return findAnyMatchingNumericalField(propertyName);
        default:
            break;
        }

        LOGGER.info("Could not find exact schema field name for [{}], attempting to search with [{}]", propertyName,
                fieldName);

        return fieldName;
    }

    public String getFieldSuffix(AttributeFormat format) {
        return schemaFields.getFieldSuffix(format);
    }

    public String getMetacardUniqueId(SolrDocument doc) {
        return doc.getFieldValue(SchemaFields.METACARD_UNIQUE_ID_NAME).toString();
    }

    public String getMetacardId(SolrDocument doc) {
        return doc.getFieldValue(SchemaFields.METACARD_ID_NAME).toString();
    }

    public String getMetacardSource(SolrDocument doc) {
        return doc.getFieldValue(SchemaFields.METACARD_SOURCE_NAME).toString();
    }

    public MetacardType getMetacardType(SolrDocument doc) throws MetacardCreationException {
        String mTypeFieldName = doc.getFieldValue(SchemaFields.METACARD_TYPE_FIELD_NAME).toString();

        MetacardType cachedMetacardType = metacardTypesCache.get(mTypeFieldName);

        if (cachedMetacardType != null) {
            return cachedMetacardType;
        }

        byte[] bytes = (byte[]) doc.getFieldValue(SchemaFields.METACARD_TYPE_OBJECT_FIELD_NAME);

        ByteArrayInputStream bais = null;
        ObjectInputStream in = null;
        try {

            bais = new ByteArrayInputStream((byte[]) bytes);

            in = new ObjectInputStream(bais);

            cachedMetacardType = (MetacardType) in.readObject();

        } catch (IOException e) {

            LOGGER.warn("IO exception loading cached metacard type", e);

            throw new MetacardCreationException(COULD_NOT_READ_METACARD_TYPE_MESSAGE);
        } catch (ClassNotFoundException e) {
            LOGGER.warn("Class exception loading cached metacard type", e);

            throw new MetacardCreationException(COULD_NOT_READ_METACARD_TYPE_MESSAGE);
        } finally {
            IOUtils.closeQuietly(bais);
            IOUtils.closeQuietly(in);
        }

        metacardTypeNameToSerialCache.put(mTypeFieldName, bytes);
        metacardTypesCache.put(mTypeFieldName, cachedMetacardType);
        addToFieldsCache(cachedMetacardType.getAttributeDescriptors());
        return cachedMetacardType;
    }

    public String getCaseSensitiveField(String mappedPropertyName) {
        // TODO We can check if this field really does exist
        return mappedPropertyName + SchemaFields.HAS_CASE;
    }

    protected String getSpecialIndexSuffix(AttributeFormat format) {

        switch (format) {
        case STRING:
            return SchemaFields.TOKENIZED;
        case GEOMETRY:
            return SchemaFields.INDEXED;
        case XML:
            return SchemaFields.TEXT_PATH;
        default:
            break;
        }

        return "";
    }

    private void addToFieldsCache(Set<AttributeDescriptor> descriptors) {
        for (AttributeDescriptor ad : descriptors) {

            AttributeFormat format = ad.getType().getAttributeFormat();

            fieldsCache.add(ad.getName() + schemaFields.getFieldSuffix(format));

            if (!getSpecialIndexSuffix(format).equals("")) {
                fieldsCache.add(ad.getName() + schemaFields.getFieldSuffix(format) + getSpecialIndexSuffix(format));
            }

            if (format.equals(AttributeFormat.STRING)) {
                fieldsCache.add(ad.getName() + schemaFields.getFieldSuffix(format) + getSpecialIndexSuffix(format)
                        + SchemaFields.HAS_CASE);
            }

            if (format.equals(AttributeFormat.XML)) {
                fieldsCache.add(ad.getName() + SchemaFields.TEXT_SUFFIX + SchemaFields.TOKENIZED);
                fieldsCache.add(
                        ad.getName() + SchemaFields.TEXT_SUFFIX + SchemaFields.TOKENIZED + SchemaFields.HAS_CASE);
                fieldsCache.add(ad.getName() + schemaFields.getFieldSuffix(format) + getSpecialIndexSuffix(format));
            }
        }
    }

    private byte[] serialize(MetacardType anywhereMType) throws MetacardCreationException {

        ByteArrayOutputStream baos = new ByteArrayOutputStream();

        ObjectOutputStream out = null;

        try {
            out = new ObjectOutputStream(baos);
            out.writeObject(anywhereMType);

            byte[] bytes = baos.toByteArray();

            baos.close();
            out.close();

            return bytes;
        } catch (IOException e) {
            LOGGER.warn("IO exception reading metacard type message", e);
            throw new MetacardCreationException(COULD_NOT_READ_METACARD_TYPE_MESSAGE);
        }

    }

    private String findAnyMatchingNumericalField(String propertyName) {

        if (fieldsCache.contains(propertyName + SchemaFields.DOUBLE_SUFFIX)) {
            return propertyName + SchemaFields.DOUBLE_SUFFIX;
        }
        if (fieldsCache.contains(propertyName + SchemaFields.FLOAT_SUFFIX)) {
            return propertyName + SchemaFields.FLOAT_SUFFIX;
        }
        if (fieldsCache.contains(propertyName + SchemaFields.INTEGER_SUFFIX)) {
            return propertyName + SchemaFields.INTEGER_SUFFIX;
        }
        if (fieldsCache.contains(propertyName + SchemaFields.LONG_SUFFIX)) {
            return propertyName + SchemaFields.LONG_SUFFIX;
        }
        if (fieldsCache.contains(propertyName + SchemaFields.SHORT_SUFFIX)) {
            return propertyName + SchemaFields.SHORT_SUFFIX;
        }

        LOGGER.info("Did not find any numerical schema fields for property [{}]. Replacing with property [{}]",
                propertyName, propertyName + SchemaFields.INTEGER_SUFFIX);
        return propertyName + SchemaFields.INTEGER_SUFFIX;
    }

    /**
     * Given xml as a string, this method will parse out element text and CDATA text. It separates
     * each by one space character.
     * 
     * @param xmlData
     *            XML as a {@code String}
     * @return parsed CDATA and element text
     */
    protected String parseTextFrom(String xmlData) {

        StringBuilder builder = new StringBuilder();

        XMLStreamReader xmlStreamReader = null;
        StringReader sr = null;
        long starttime = System.currentTimeMillis();
        try {
            // xml parser does not handle leading whitespace
            sr = new StringReader(xmlData);
            xmlStreamReader = xmlInputFactory.createXMLStreamReader(sr);

            while (xmlStreamReader.hasNext()) {
                int event = xmlStreamReader.next();

                if (event == XMLStreamConstants.CHARACTERS || event == XMLStreamConstants.CDATA) {

                    String text = xmlStreamReader.getText();

                    if (StringUtils.isNotBlank(text)) {
                        builder.append(" " + text.trim());
                    }

                }
                if (event == XMLStreamConstants.START_ELEMENT) {
                    for (int i = 0; i < xmlStreamReader.getAttributeCount(); i++) {

                        String text = xmlStreamReader.getAttributeValue(i);

                        if (StringUtils.isNotBlank(text)) {
                            builder.append(" " + text.trim());
                        }
                    }
                }
            }
        } catch (XMLStreamException e1) {
            LOGGER.warn("Failure occurred in parsing the xml data. No data has been stored or indexed.", e1);
        } finally {
            IOUtils.closeQuietly(sr);
            if (xmlStreamReader != null) {
                try {
                    xmlStreamReader.close();
                } catch (XMLStreamException e) {
                    LOGGER.debug("Exception closing XMLStreamReader", e);
                }
            }
        }
        long endTime = System.currentTimeMillis();

        LOGGER.debug("Parsing took {} ms", endTime - starttime);

        return builder.toString();
    }
}