org.apache.solr.handler.dataimport.XPathEntityProcessor.java Source code

Introduction

Here is the source code for org.apache.solr.handler.dataimport.XPathEntityProcessor.java, the streaming XPath-based entity processor from Apache Solr's DataImportHandler.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.dataimport;

import static org.apache.solr.handler.dataimport.DataImportHandlerException.SEVERE;
import static org.apache.solr.handler.dataimport.DataImportHandlerException.wrapAndThrow;
import org.apache.solr.core.SolrCore;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.solr.util.SystemIdResolver;
import org.apache.solr.common.util.XMLErrorLogger;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.commons.io.IOUtils;

import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;
import java.io.CharArrayReader;
import java.io.CharArrayWriter;
import java.io.Reader;
import java.lang.invoke.MethodHandles;
import java.util.*;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicReference;

/**
 * <p> An implementation of {@link EntityProcessor} which uses a streaming xpath parser to extract values out of XML documents.
 * It is typically used in conjunction with {@link URLDataSource} or {@link FileDataSource}. </p> <p> Refer to <a
 * href="http://wiki.apache.org/solr/DataImportHandler">http://wiki.apache.org/solr/DataImportHandler</a> for more
 * details. </p>
 * <p>
 * <b>This API is experimental and may change in the future.</b>
 *
 *
 * @see XPathRecordReader
 * @since solr 1.3
 */
public class XPathEntityProcessor extends EntityProcessorBase {
    private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
    private static final XMLErrorLogger xmllog = new XMLErrorLogger(LOG);

    private static final Map<String, Object> END_MARKER = new HashMap<>();

    protected List<String> placeHolderVariables;

    protected List<String> commonFields;

    private String pk;

    private XPathRecordReader xpathReader;

    protected DataSource<Reader> dataSource;

    protected javax.xml.transform.Transformer xslTransformer;

    protected boolean useSolrAddXml = false;

    protected boolean streamRows = false;

    // Amount of time to block reading/writing to queue when streaming
    protected int blockingQueueTimeOut = 10;

    // Units for blockingQueueTimeOut
    protected TimeUnit blockingQueueTimeOutUnits = TimeUnit.SECONDS;

    // Number of rows to queue for asynchronous processing
    protected int blockingQueueSize = 1000;

    protected Thread publisherThread;

    protected boolean reinitXPathReader = true;

    @Override
    @SuppressWarnings("unchecked")
    public void init(Context context) {
        super.init(context);
        if (reinitXPathReader)
            initXpathReader(context.getVariableResolver());
        pk = context.getEntityAttribute("pk");
        dataSource = context.getDataSource();
        rowIterator = null;

    }

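    /**
     * Reads the entity attributes (useSolrAddSchema, stream, batchSize, readTimeOut, xsl),
     * builds the optional XSL transformer, configures the XPathRecordReader from the
     * forEach expression and the per-field xpath attributes, and records which url
     * placeholder variables and commonField columns must be remembered between requests.
     */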
    private void initXpathReader(VariableResolver resolver) {
        reinitXPathReader = false;
        useSolrAddXml = Boolean.parseBoolean(context.getEntityAttribute(USE_SOLR_ADD_SCHEMA));
        streamRows = Boolean.parseBoolean(context.getEntityAttribute(STREAM));
        if (context.getResolvedEntityAttribute("batchSize") != null) {
            blockingQueueSize = Integer.parseInt(context.getEntityAttribute("batchSize"));
        }
        if (context.getResolvedEntityAttribute("readTimeOut") != null) {
            blockingQueueTimeOut = Integer.parseInt(context.getEntityAttribute("readTimeOut"));
        }
        String xslt = context.getEntityAttribute(XSL);
        if (xslt != null) {
            xslt = context.replaceTokens(xslt);
            try {
                // create an instance of TransformerFactory
                TransformerFactory transFact = TransformerFactory.newInstance();
                final SolrCore core = context.getSolrCore();
                final StreamSource xsltSource;
                if (core != null) {
                    final ResourceLoader loader = core.getResourceLoader();
                    transFact.setURIResolver(new SystemIdResolver(loader).asURIResolver());
                    xsltSource = new StreamSource(loader.openResource(xslt),
                            SystemIdResolver.createSystemIdFromResourceName(xslt));
                } else {
                    // fallback for tests
                    xsltSource = new StreamSource(xslt);
                }
                transFact.setErrorListener(xmllog);
                try {
                    xslTransformer = transFact.newTransformer(xsltSource);
                } finally {
                    // some XML parsers are broken and don't close the byte stream (but they should according to spec)
                    IOUtils.closeQuietly(xsltSource.getInputStream());
                }
                LOG.info("Using xslTransformer: " + xslTransformer.getClass().getName());
            } catch (Exception e) {
                throw new DataImportHandlerException(SEVERE, "Error initializing XSL ", e);
            }
        }

        if (useSolrAddXml) {
            // Support solr add documents
            xpathReader = new XPathRecordReader("/add/doc");
            xpathReader.addField("name", "/add/doc/field/@name", true);
            xpathReader.addField("value", "/add/doc/field", true);
        } else {
            String forEachXpath = context.getResolvedEntityAttribute(FOR_EACH);
            if (forEachXpath == null)
                throw new DataImportHandlerException(SEVERE,
                        "Entity : " + context.getEntityAttribute("name") + " must have a 'forEach' attribute");
            // a resolved value that differs from the raw attribute means forEach contains a
            // template; in that case the XPathRecordReader must be rebuilt for each run
            if (!forEachXpath.equals(context.getEntityAttribute(FOR_EACH)))
                reinitXPathReader = true;

            try {
                xpathReader = new XPathRecordReader(forEachXpath);
                for (Map<String, String> field : context.getAllEntityFields()) {
                    if (field.get(XPATH) == null)
                        continue;
                    int flags = 0;
                    if ("true".equals(field.get("flatten"))) {
                        flags = XPathRecordReader.FLATTEN;
                    }
                    String xpath = field.get(XPATH);
                    xpath = context.replaceTokens(xpath);
                    // !xpath.equals(field.get(XPATH)) means the field xpath has a template;
                    // in that case ensure that the XPathRecordReader is reinitialized
                    // for each xml
                    if (!xpath.equals(field.get(XPATH)) && !context.isRootEntity())
                        reinitXPathReader = true;
                    xpathReader.addField(field.get(DataImporter.COLUMN), xpath,
                            Boolean.parseBoolean(field.get(DataImporter.MULTI_VALUED)), flags);
                }
            } catch (RuntimeException e) {
                throw new DataImportHandlerException(SEVERE, "Exception while reading xpaths for fields", e);
            }
        }
        String url = context.getEntityAttribute(URL);
        List<String> l = url == null ? Collections.emptyList() : resolver.getVariables(url);
        for (String s : l) {
            if (s.startsWith(entityName + ".")) {
                if (placeHolderVariables == null)
                    placeHolderVariables = new ArrayList<>();
                placeHolderVariables.add(s.substring(entityName.length() + 1));
            }
        }
        for (Map<String, String> fld : context.getAllEntityFields()) {
            if (fld.get(COMMON_FIELD) != null && "true".equals(fld.get(COMMON_FIELD))) {
                if (commonFields == null)
                    commonFields = new ArrayList<>();
                commonFields.add(fld.get(DataImporter.COLUMN));
            }
        }

    }

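    /**
     * Returns the next parsed row. For the root entity, rows whose primary key
     * (the optional 'pk' attribute) is missing are silently skipped.
     */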
    @Override
    public Map<String, Object> nextRow() {
        Map<String, Object> result;

        if (!context.isRootEntity())
            return fetchNextRow();

        while (true) {
            result = fetchNextRow();

            if (result == null)
                return null;

            if (pk == null || result.get(pk) != null)
                return result;
        }
    }

    @Override
    public void postTransform(Map<String, Object> r) {
        readUsefulVars(r);
    }

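    /**
     * Pulls the next row from the current iterator, lazily initializing it from the
     * entity url. When the iterator is exhausted, the $hasMore / $nextUrl session
     * attributes (recorded by readUsefulVars) are consulted to fetch the next page.
     */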
    @SuppressWarnings("unchecked")
    private Map<String, Object> fetchNextRow() {
        Map<String, Object> r = null;
        while (true) {
            if (rowIterator == null)
                initQuery(context.replaceTokens(context.getEntityAttribute(URL)));
            r = getNext();
            if (r == null) {
                Object hasMore = context.getSessionAttribute(HAS_MORE, Context.SCOPE_ENTITY);
                try {
                    if ("true".equals(hasMore) || Boolean.TRUE.equals(hasMore)) {
                        String url = (String) context.getSessionAttribute(NEXT_URL, Context.SCOPE_ENTITY);
                        if (url == null)
                            url = context.getEntityAttribute(URL);
                        addNamespace();
                        initQuery(context.replaceTokens(url));
                        r = getNext();
                        if (r == null)
                            return null;
                    } else {
                        return null;
                    }
                } finally {
                    context.setSessionAttribute(HAS_MORE, null, Context.SCOPE_ENTITY);
                    context.setSessionAttribute(NEXT_URL, null, Context.SCOPE_ENTITY);
                }
            }
            addCommonFields(r);
            return r;
        }
    }

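    /**
     * Publishes the remembered commonField and placeholder values as a namespace on the
     * variable resolver so they can be used while resolving the next url.
     */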
    private void addNamespace() {
        Map<String, Object> namespace = new HashMap<>();
        Set<String> allNames = new HashSet<>();
        if (commonFields != null)
            allNames.addAll(commonFields);
        if (placeHolderVariables != null)
            allNames.addAll(placeHolderVariables);
        if (allNames.isEmpty())
            return;

        for (String name : allNames) {
            Object val = context.getSessionAttribute(name, Context.SCOPE_ENTITY);
            if (val != null)
                namespace.put(name, val);
        }
        ((VariableResolver) context.getVariableResolver()).addNamespace(entityName, namespace);
    }

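    /**
     * Copies any missing commonField values into the row from the entity-scoped
     * session attributes recorded by readUsefulVars.
     */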
    private void addCommonFields(Map<String, Object> r) {
        if (commonFields != null) {
            for (String commonField : commonFields) {
                if (r.get(commonField) == null) {
                    Object val = context.getSessionAttribute(commonField, Context.SCOPE_ENTITY);
                    if (val != null)
                        r.put(commonField, val);
                }

            }
        }

    }

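    /**
     * Opens the data source for the resolved url, applies the optional XSL transformation,
     * and builds the row iterator: either a streaming iterator backed by a publisher thread
     * (stream="true") or an eagerly parsed in-memory list, honoring the onError policy.
     */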
    private void initQuery(String s) {
        Reader data = null;
        try {
            final List<Map<String, Object>> rows = new ArrayList<>();
            try {
                data = dataSource.getData(s);
            } catch (Exception e) {
                if (ABORT.equals(onError)) {
                    wrapAndThrow(SEVERE, e);
                } else if (SKIP.equals(onError)) {
                    if (LOG.isDebugEnabled())
                        LOG.debug("Skipping url : " + s, e);
                    wrapAndThrow(DataImportHandlerException.SKIP, e);
                } else {
                    LOG.warn("Failed for url : " + s, e);
                    rowIterator = Collections.emptyIterator();
                    return;
                }
            }
            if (xslTransformer != null) {
                try {
                    SimpleCharArrayReader caw = new SimpleCharArrayReader();
                    xslTransformer.transform(new StreamSource(data), new StreamResult(caw));
                    data = caw.getReader();
                } catch (TransformerException e) {
                    if (ABORT.equals(onError)) {
                        wrapAndThrow(SEVERE, e, "Exception in applying XSL Transformation");
                    } else if (SKIP.equals(onError)) {
                        wrapAndThrow(DataImportHandlerException.SKIP, e);
                    } else {
                        LOG.warn("Failed for url : " + s, e);
                        rowIterator = Collections.emptyIterator();
                        return;
                    }
                }
            }
            if (streamRows) {
                rowIterator = getRowIterator(data, s);
            } else {
                try {
                    xpathReader.streamRecords(data, (record, xpath) -> rows.add(readRow(record, xpath)));
                } catch (Exception e) {
                    String msg = "Parsing failed for xml, url:" + s + " rows processed:" + rows.size();
                    if (rows.size() > 0)
                        msg += " last row: " + rows.get(rows.size() - 1);
                    if (ABORT.equals(onError)) {
                        wrapAndThrow(SEVERE, e, msg);
                    } else if (SKIP.equals(onError)) {
                        LOG.warn(msg, e);
                        Map<String, Object> map = new HashMap<>();
                        map.put(DocBuilder.SKIP_DOC, Boolean.TRUE);
                        rows.add(map);
                    } else if (CONTINUE.equals(onError)) {
                        LOG.warn(msg, e);
                    }
                }
                rowIterator = rows.iterator();
            }
        } finally {
            if (!streamRows) {
                closeIt(data);
            }

        }
    }

    private void closeIt(Reader data) {
        try {
            data.close();
        } catch (Exception e) { /* Ignore */
        }
    }

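    /**
     * Converts a raw record from the XPathRecordReader into a row. In useSolrAddXml mode the
     * parallel name/value lists of a &lt;doc&gt; element are folded into a column-to-value map
     * (repeated names become lists); otherwise the matching forEach xpath is added under $forEach.
     */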
    @SuppressWarnings("unchecked")
    protected Map<String, Object> readRow(Map<String, Object> record, String xpath) {
        if (useSolrAddXml) {
            List<String> names = (List<String>) record.get("name");
            List<String> values = (List<String>) record.get("value");
            Map<String, Object> row = new HashMap<>();
            for (int i = 0; i < names.size() && i < values.size(); i++) {
                if (row.containsKey(names.get(i))) {
                    Object existing = row.get(names.get(i));
                    if (existing instanceof List) {
                        List<Object> list = (List<Object>) existing;
                        list.add(values.get(i));
                    } else {
                        List<Object> list = new ArrayList<>();
                        list.add(existing);
                        list.add(values.get(i));
                        row.put(names.get(i), list);
                    }
                } else {
                    row.put(names.get(i), values.get(i));
                }
            }
            return row;
        } else {
            record.put(XPATH_FIELD_NAME, xpath);
            return record;
        }
    }

    private static class SimpleCharArrayReader extends CharArrayWriter {
        public Reader getReader() {
            return new CharArrayReader(super.buf, 0, super.count);
        }

    }

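    /**
     * Stores $hasMore, $nextUrl, the url placeholder variables and the commonField values
     * from the row as entity-scoped session attributes for use by later requests.
     */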
    @SuppressWarnings("unchecked")
    private Map<String, Object> readUsefulVars(Map<String, Object> r) {
        Object val = r.get(HAS_MORE);
        if (val != null)
            context.setSessionAttribute(HAS_MORE, val, Context.SCOPE_ENTITY);
        val = r.get(NEXT_URL);
        if (val != null)
            context.setSessionAttribute(NEXT_URL, val, Context.SCOPE_ENTITY);
        if (placeHolderVariables != null) {
            for (String s : placeHolderVariables) {
                val = r.get(s);
                context.setSessionAttribute(s, val, Context.SCOPE_ENTITY);
            }
        }
        if (commonFields != null) {
            for (String s : commonFields) {
                Object commonVal = r.get(s);
                if (commonVal != null) {
                    context.setSessionAttribute(s, commonVal, Context.SCOPE_ENTITY);
                }
            }
        }
        return r;

    }

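    /**
     * Used when stream="true": starts a publisher thread that parses the XML and pushes rows
     * into a bounded blocking queue, and returns an iterator that polls the queue until the
     * end marker arrives, reporting any parse failure according to the onError policy.
     */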
    private Iterator<Map<String, Object>> getRowIterator(final Reader data, final String s) {
        // nothing atomic about it. I just needed a mutable strong reference
        final AtomicReference<Exception> exp = new AtomicReference<>();
        final BlockingQueue<Map<String, Object>> blockingQueue = new ArrayBlockingQueue<>(blockingQueueSize);
        final AtomicBoolean isEnd = new AtomicBoolean(false);
        final AtomicBoolean throwExp = new AtomicBoolean(true);
        publisherThread = new Thread() {
            @Override
            public void run() {
                try {
                    xpathReader.streamRecords(data, (record, xpath) -> {
                        if (isEnd.get()) {
                            throwExp.set(false);
                            // end the streaming; otherwise the parsing would go on forever
                            // even though the consumer has gone away
                            throw new RuntimeException("BREAK");
                        }
                        Map<String, Object> row;
                        try {
                            row = readRow(record, xpath);
                        } catch (Exception e) {
                            isEnd.set(true);
                            return;
                        }
                        offer(row);
                    });
                } catch (Exception e) {
                    if (throwExp.get())
                        exp.set(e);
                } finally {
                    closeIt(data);
                    if (!isEnd.get()) {
                        offer(END_MARKER);
                    }
                }
            }

            private void offer(Map<String, Object> row) {
                try {
                    while (!blockingQueue.offer(row, blockingQueueTimeOut, blockingQueueTimeOutUnits)) {
                        if (isEnd.get())
                            return;
                        LOG.debug("Timeout elapsed writing records.  Perhaps buffer size should be increased.");
                    }
                } catch (InterruptedException e) {
                    return;
                } finally {
                    synchronized (this) {
                        notifyAll();
                    }
                }
            }
        };

        publisherThread.start();

        return new Iterator<Map<String, Object>>() {
            private Map<String, Object> lastRow;
            int count = 0;

            @Override
            public boolean hasNext() {
                return !isEnd.get();
            }

            @Override
            public Map<String, Object> next() {
                Map<String, Object> row;

                do {
                    try {
                        row = blockingQueue.poll(blockingQueueTimeOut, blockingQueueTimeOutUnits);
                        if (row == null) {
                            LOG.debug("Timeout elapsed reading records.");
                        }
                    } catch (InterruptedException e) {
                        LOG.debug("Caught InterruptedException while waiting for row.  Aborting.");
                        isEnd.set(true);
                        return null;
                    }
                } while (row == null);

                if (row == END_MARKER) {
                    isEnd.set(true);
                    if (exp.get() != null) {
                        String msg = "Parsing failed for xml, url:" + s + " rows processed in this xml:" + count;
                        if (lastRow != null)
                            msg += " last row in this xml:" + lastRow;
                        if (ABORT.equals(onError)) {
                            wrapAndThrow(SEVERE, exp.get(), msg);
                        } else if (SKIP.equals(onError)) {
                            wrapAndThrow(DataImportHandlerException.SKIP, exp.get());
                        } else {
                            LOG.warn(msg, exp.get());
                        }
                    }
                    return null;
                }
                count++;
                return lastRow = row;
            }

            @Override
            public void remove() {
                /*no op*/
            }
        };

    }

    public static final String URL = "url";

    public static final String HAS_MORE = "$hasMore";

    public static final String NEXT_URL = "$nextUrl";

    public static final String XPATH_FIELD_NAME = "$forEach";

    public static final String FOR_EACH = "forEach";

    public static final String XPATH = "xpath";

    public static final String COMMON_FIELD = "commonField";

    public static final String USE_SOLR_ADD_SCHEMA = "useSolrAddSchema";

    public static final String XSL = "xsl";

    public static final String STREAM = "stream";

}
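
Usage sketch

The class above delegates the actual XML parsing to XPathRecordReader: the constructor takes the forEach expression, addField(column, xpath, multiValued[, flags]) registers the columns, and streamRecords(reader, handler) invokes the handler once per matching record, which is exactly what initQuery and getRowIterator do. The snippet below is a minimal, hypothetical sketch of that API based only on the calls visible in this listing; the RSS-like paths, field names, and inline XML are invented for illustration and are not part of the Solr sources.

// Hypothetical standalone sketch; the package declaration keeps it next to XPathRecordReader
// so the example compiles even if parts of that API are package-visible.
package org.apache.solr.handler.dataimport;

import java.io.Reader;
import java.io.StringReader;

public class XPathRecordReaderSketch {
    public static void main(String[] args) throws Exception {
        // emit one record per /rss/channel/item element (illustrative forEach expression)
        XPathRecordReader reader = new XPathRecordReader("/rss/channel/item");
        reader.addField("title", "/rss/channel/item/title", false);      // single-valued -> String
        reader.addField("category", "/rss/channel/item/category", true); // multiValued -> List

        Reader xml = new StringReader(
            "<rss><channel>"
                + "<item><title>First</title><category>a</category><category>b</category></item>"
                + "<item><title>Second</title><category>c</category></item>"
                + "</channel></rss>");

        // the handler receives one Map per record plus the forEach xpath that matched,
        // mirroring the lambda passed to streamRecords(...) in initQuery() above
        reader.streamRecords(xml, (record, xpath) -> System.out.println(xpath + " -> " + record));
    }
}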