com.marklogic.mapreduce.ContentOutputFormat.java Source code

Introduction

Here is the source code for com.marklogic.mapreduce.ContentOutputFormat.java.

Source

/*
 * Copyright 2003-2016 MarkLogic Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.marklogic.mapreduce;

import java.io.IOException;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.DefaultStringifier;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

import com.marklogic.mapreduce.utilities.AssignmentManager;
import com.marklogic.mapreduce.utilities.AssignmentPolicy;
import com.marklogic.mapreduce.utilities.ForestInfo;
import com.marklogic.mapreduce.utilities.InternalUtilities;
import com.marklogic.mapreduce.utilities.TextArrayWritable;
import com.marklogic.xcc.AdhocQuery;
import com.marklogic.xcc.ContentCapability;
import com.marklogic.xcc.ContentSource;
import com.marklogic.xcc.RequestOptions;
import com.marklogic.xcc.ResultItem;
import com.marklogic.xcc.ResultSequence;
import com.marklogic.xcc.Session;
import com.marklogic.xcc.exceptions.RequestException;
import com.marklogic.xcc.exceptions.XccConfigException;
import com.marklogic.xcc.types.XSBoolean;

/**
 * MarkLogicOutputFormat for Content.
 * 
 * <p>
 *  Use this class to store results as content in a MarkLogic Server database.
 *  The text, XML, or binary content is inserted into the database at the
 *  given {@link DocumentURI}.
 * </p>
 * <p>
 *  When using this {@link MarkLogicOutputFormat}, your key should be the URI of
 *  the document to insert into the database. The value should be the content to
 *  insert, in the form of {@link org.apache.hadoop.io.Text} or 
 *  {@link MarkLogicNode}.
 * </p>
 * <p>
 *  Several configuration properties exist for controlling the content insertion,
 *  including permissions, collections, quality, directory, and content type.
 * </p>
 * 
 * @see MarkLogicConstants
 * @see com.marklogic.mapreduce.examples.ContentLoader
 * @see com.marklogic.mapreduce.examples.ZipContentLoader
 * @author jchen
 *
 * @param <VALUEOUT> the type of the output value
 */
public class ContentOutputFormat<VALUEOUT> extends MarkLogicOutputFormat<DocumentURI, VALUEOUT> {
    public static final Log LOG = LogFactory.getLog(ContentOutputFormat.class);

    // Prepended to a forest id to form a database name parsed by XDBC.
    // Also used alone as the forest id placeholder in non-fast-load mode.
    public static final String ID_PREFIX = "#";

    static final String FOREST_HOST_MAP_QUERY = "import module namespace hadoop = "
            + "\"http://marklogic.com/xdmp/hadoop\" at \"/MarkLogic/hadoop.xqy\";\n"
            + "hadoop:get-forest-host-map()";
    public static final String FOREST_HOST_QUERY = "import module namespace hadoop = "
            + "\"http://marklogic.com/xdmp/hadoop\" at \"/MarkLogic/hadoop.xqy\";\n"
            + "declare variable $policy as xs:string external;\n"
            + "declare variable $partition-name as xs:string external;\n"
            + "hadoop:get-forest-host($policy,$partition-name)";
    public static final String INIT_QUERY = "import module namespace hadoop = "
            + "\"http://marklogic.com/xdmp/hadoop\" at \"/MarkLogic/hadoop.xqy\";\n"
            + "xdmp:host-name(xdmp:host()), \n" + "let $f := "
            + "  fn:function-lookup(xs:QName('hadoop:get-assignment-policy'),0)\n"
            + "return if (exists($f)) then $f() else ()";

    protected AssignmentManager am = AssignmentManager.getInstance();
    protected boolean fastLoad;
    /** Whether the stats-based policy allows fastload. */
    protected boolean allowFastLoad = true;
    protected AssignmentPolicy.Kind policy;
    protected boolean legacy = false;
    protected String initHostName;

    @Override
    public void checkOutputSpecs(Configuration conf, ContentSource cs) throws IOException {
        Session session = null;
        ResultSequence result = null;
        try {
            session = cs.newSession();
            RequestOptions options = new RequestOptions();
            options.setDefaultXQueryVersion("1.0-ml");
            session.setDefaultRequestOptions(options);

            // clear output dir if specified
            String outputDir = conf.get(OUTPUT_DIRECTORY);
            if (outputDir != null) {
                outputDir = outputDir.endsWith("/") ? outputDir : outputDir + "/";
                if (conf.getBoolean(OUTPUT_CLEAN_DIR, false)) {
                    // delete directory if exists
                    String queryText = DELETE_DIRECTORY_TEMPLATE.replace(DIRECTORY_TEMPLATE, outputDir);
                    AdhocQuery query = session.newAdhocQuery(queryText);
                    result = session.submitRequest(query);
                } else { // ensure nothing exists under output dir
                    String queryText = CHECK_DIRECTORY_EXIST_TEMPLATE.replace(DIRECTORY_TEMPLATE, outputDir);
                    AdhocQuery query = session.newAdhocQuery(queryText);
                    result = session.submitRequest(query);
                    if (result.hasNext()) {
                        ResultItem item = result.next();
                        if (((XSBoolean) (item.getItem())).asBoolean()) {
                            throw new IllegalStateException("Directory " + outputDir + " already exists");
                        }
                    } else {
                        throw new IllegalStateException("Failed to query directory content.");
                    }
                }
            }
            // initialize server host name and assignment policy
            initialize(session);

            // fast load mode requires manual directory creation
            if (fastLoad) {
                LOG.info("Running in fast load mode");
                // store forest-info map into config system
                DefaultStringifier.store(conf, queryForestInfo(cs), OUTPUT_FOREST_HOST);

                AdhocQuery query = session.newAdhocQuery(DIRECTORY_CREATE_QUERY);
                result = session.submitRequest(query);
                if (result.hasNext()) {
                    ResultItem item = result.next();
                    String dirMode = item.asString();
                    if (!dirMode.equals(MANUAL_DIRECTORY_MODE)) {
                        throw new IllegalStateException("Manual directory creation mode is required. "
                                + "The current creation mode is " + dirMode + ".");
                    }
                } else {
                    throw new IllegalStateException("Failed to query directory creation mode.");
                }
            } else {
                TextArrayWritable hostArray;
                // 23798: replace hostname in forest config with 
                // user-specified output host
                String outputHost = conf.get(OUTPUT_HOST);
                if (MODE_LOCAL.equals(conf.get(EXECUTION_MODE))) {
                    hostArray = queryHosts(cs, initHostName, outputHost);
                } else {
                    hostArray = queryHosts(cs);
                }
                DefaultStringifier.store(conf, hostArray, OUTPUT_FOREST_HOST);
            }

            // validate capabilities
            String[] perms = conf.getStrings(OUTPUT_PERMISSION);
            if (perms != null && perms.length > 0) {
                if (perms.length % 2 != 0) {
                    throw new IllegalStateException("Permissions are expected to be in <role, capability> pairs.");
                }
                int i = 0;
                while (i + 1 < perms.length) {
                    String roleName = perms[i++];
                    if (roleName == null || roleName.isEmpty()) {
                        throw new IllegalStateException("Illegal role name: " + roleName);
                    }
                    String perm = perms[i].trim();
                    if (!perm.equalsIgnoreCase(ContentCapability.READ.toString())
                            && !perm.equalsIgnoreCase(ContentCapability.EXECUTE.toString())
                            && !perm.equalsIgnoreCase(ContentCapability.INSERT.toString())
                            && !perm.equalsIgnoreCase(ContentCapability.UPDATE.toString())) {
                        throw new IllegalStateException("Illegal capability: " + perm);
                    }
                    i++;
                }
            }
        } catch (RequestException ex) {
            throw new IOException(ex);
        } finally {
            // close the result sequence before the session that produced it
            if (result != null) {
                result.close();
            }
            if (session != null) {
                session.close();
            }
        }
    }

    protected Map<String, ContentSource> getSourceMap(boolean fastLoad, TaskAttemptContext context)
            throws IOException {
        Configuration conf = context.getConfiguration();
        Map<String, ContentSource> sourceMap = new LinkedHashMap<String, ContentSource>();
        if (fastLoad) {
            LinkedMapWritable forestStatusMap = getForestStatusMap(conf);
            // get host->contentSource mapping
            Map<String, ContentSource> hostSourceMap = new HashMap<String, ContentSource>();
            for (Writable v : forestStatusMap.values()) {
                ForestInfo fs = (ForestInfo) v;
                // skip forests that do not allow updates
                if (!fs.getUpdatable())
                    continue;
                if (hostSourceMap.get(fs.getHostName()) == null) {
                    try {
                        ContentSource cs = InternalUtilities.getOutputContentSource(conf,
                                fs.getHostName());
                        hostSourceMap.put(fs.getHostName(), cs);
                    } catch (XccConfigException e) {
                        throw new IOException(e);
                    }
                }
            }

            // consolidate the forest->host map and the host->contentSource
            // map into a forest->contentSource map
            for (Writable forestId : forestStatusMap.keySet()) {
                String forest = ((Text) forestId).toString();
                String hostName = ((ForestInfo) forestStatusMap.get(forestId)).getHostName();
                ContentSource cs = hostSourceMap.get(hostName);
                sourceMap.put(ID_PREFIX + forest, cs);
            }
        } else {
            TextArrayWritable hosts = getHosts(conf);
            for (Writable host : hosts.get()) {
                String hostStr = host.toString();
                try {
                    ContentSource cs = InternalUtilities.getOutputContentSource(conf, hostStr);
                    sourceMap.put(hostStr, cs);
                } catch (XccConfigException e) {
                    throw new IOException(e);
                }
            }
        }
        return sourceMap;
    }

    @Override
    public RecordWriter<DocumentURI, VALUEOUT> getRecordWriter(TaskAttemptContext context)
            throws IOException, InterruptedException {
        Configuration conf = context.getConfiguration();
        // TODO: if MAPREDUCE-3377 still exists, need to re-run initialize
        fastLoad = conf.getBoolean(OUTPUT_FAST_LOAD, false);
        Map<String, ContentSource> sourceMap = getSourceMap(fastLoad, context);
        // construct the ContentWriter
        return new ContentWriter<VALUEOUT>(conf, sourceMap, fastLoad, am);
    }

    // The forest host map is saved when checkOutputSpecs() is called.  In
    // certain versions of Hadoop, the config is not persisted as part of the
    // job and will therefore be lost.  See MAPREDUCE-3377 for details.  When
    // this entry cannot be found in the config, re-query the database to get
    // this info.  It is possible that each task gets a different version of
    // the map if the forest config changes while the job runs.
    protected LinkedMapWritable getForestStatusMap(Configuration conf) throws IOException {
        String forestHost = conf.get(OUTPUT_FOREST_HOST);
        if (forestHost != null) {
            //Restores the object from the configuration.
            LinkedMapWritable fhmap = DefaultStringifier.load(conf, OUTPUT_FOREST_HOST, LinkedMapWritable.class);
            // must be in fast load mode, otherwise won't reach here
            String s = conf.get(ASSIGNMENT_POLICY);
            //EXECUTION_MODE must have a value in mlcp;
            //default is "distributed" in hadoop connector
            String mode = conf.get(EXECUTION_MODE, MODE_DISTRIBUTED);
            if (MODE_DISTRIBUTED.equals(mode)) {
                AssignmentPolicy.Kind policy = AssignmentPolicy.Kind.forName(s);
                am.initialize(policy, fhmap, conf.getInt(BATCH_SIZE, 10));
            }
            return fhmap;
        } else {
            try {
                // try getting a connection
                ContentSource cs = InternalUtilities.getOutputContentSource(conf, conf.get(OUTPUT_HOST));
                // get the assignment policy; close the session when done
                Session session = cs.newSession();
                try {
                    initialize(session);
                } finally {
                    session.close();
                }
                // query forest status mapping
                return queryForestInfo(cs);
            } catch (Exception ex) {
                throw new IOException(ex);
            }
        }
    }

    /**
     * Initialize the server host name, assignment policy, and fastload
     * setting.
     * 
     * @param session
     * @throws IOException
     * @throws RequestException
     */
    protected void initialize(Session session) throws IOException, RequestException {
        AdhocQuery query = session.newAdhocQuery(INIT_QUERY);
        RequestOptions options = new RequestOptions();
        options.setDefaultXQueryVersion("1.0-ml");
        query.setOptions(options);
        ResultSequence result = session.submitRequest(query);
        try {
            ResultItem item = result.next();
            initHostName = item.asString();
            if (result.hasNext()) {
                item = result.next();
                String policyStr = item.asString();
                conf.set(ASSIGNMENT_POLICY, policyStr);
                policy = AssignmentPolicy.Kind.forName(policyStr);
                item = result.next();
                allowFastLoad = Boolean.parseBoolean(item.asString());
                if ((policy == AssignmentPolicy.Kind.STATISTICAL
                        || policy == AssignmentPolicy.Kind.RANGE)
                        && !allowFastLoad && conf.getBoolean(OUTPUT_FAST_LOAD, false)) {
                    throw new IOException("Fastload can't be used: rebalancer is on and "
                            + "forests are imbalanced in a database with "
                            + "statistics-based assignment policy");
                }
            } else {
                policy = AssignmentPolicy.Kind.LEGACY;
                legacy = true;
            }
        } finally {
            // the caller owns the session; close only the result sequence
            result.close();
        }

        // initialize fastload mode
        if (conf.get(OUTPUT_FAST_LOAD) == null) {
            // fastload not set
            if (conf.get(OUTPUT_DIRECTORY) != null) {
                // output_dir is set, attempt to do fastload
                if (conf.get(OUTPUT_PARTITION) == null && policy == AssignmentPolicy.Kind.RANGE) {
                    fastLoad = false;
                } else if (policy == AssignmentPolicy.Kind.RANGE || policy == AssignmentPolicy.Kind.STATISTICAL) {
                    fastLoad = allowFastLoad;
                } else {
                    fastLoad = true;
                }
            } else {
                //neither fastload nor output_dir is set
                fastLoad = false;
            }
        } else {
            fastLoad = conf.getBoolean(OUTPUT_FAST_LOAD, false);
            if (fastLoad && conf.get(OUTPUT_PARTITION) == null && policy == AssignmentPolicy.Kind.RANGE) {
                throw new IllegalArgumentException("output_partition is required for fastload mode.");
            }
        }
        conf.setBoolean(OUTPUT_FAST_LOAD, fastLoad);
    }

    /**
     * Must be attempting or doing fastload when this method is called.
     * The result format of the query varies based on the assignment policy:
     * 
     * bucket:      (fid, host, updateAllow)*
     * range:       (fid, host, fragmentCount)*
     * statistical: (fid, host, fragmentCount)*
     * legacy:      (fid, host)*
     * 
     * @param cs
     * @return a forest-info map
     * @throws IOException
     */
    protected LinkedMapWritable queryForestInfo(ContentSource cs) throws IOException {
        Session session = null;
        ResultSequence result = null;
        try {
            session = cs.newSession();
            AdhocQuery query = null;
            if (legacy) {
                LOG.debug("Legacy assignment is assumed for older MarkLogic" + " Server.");
                query = session.newAdhocQuery(FOREST_HOST_MAP_QUERY);
            } else {
                query = session.newAdhocQuery(FOREST_HOST_QUERY);
                if (policy == AssignmentPolicy.Kind.RANGE) {
                    String pName = conf.get(OUTPUT_PARTITION);
                    query.setNewStringVariable("partition-name", pName);
                } else {
                    query.setNewStringVariable("partition-name", "");
                }
                query.setNewStringVariable("policy", policy.toString().toLowerCase());
            }

            // query forest status mapping                 
            RequestOptions options = new RequestOptions();
            options.setDefaultXQueryVersion("1.0-ml");
            query.setOptions(options);
            if (LOG.isDebugEnabled()) {
                LOG.debug(query.toString());
            }
            result = session.submitRequest(query);

            LinkedMapWritable forestStatusMap = new LinkedMapWritable();
            Text forest = null;
            String outputHost = conf.get(OUTPUT_HOST);
            boolean local = MODE_LOCAL.equals(conf.get(EXECUTION_MODE));

            while (result.hasNext()) {
                ResultItem item = result.next();
                if (forest == null) {
                    forest = new Text(item.asString());
                } else {
                    String hostName = item.asString();
                    // 23798: replace hostname in forest config with 
                    // user-specified output host
                    if (local && hostName != null && hostName.equals(initHostName)) {
                        hostName = outputHost;
                    }
                    if (!legacy) {
                        if (policy == AssignmentPolicy.Kind.BUCKET) {
                            item = result.next();
                            boolean updatable = Boolean.parseBoolean(item.asString());
                            forestStatusMap.put(forest, new ForestInfo(hostName, -1, updatable));
                        } else if (policy == AssignmentPolicy.Kind.LEGACY) {
                            forestStatusMap.put(forest, new ForestInfo(hostName, -1, true));
                        } else {
                            // range or statistical
                            item = result.next();
                            long dc = Long.parseLong(item.asString());
                            forestStatusMap.put(forest, new ForestInfo(hostName, dc, true));
                        }
                    } else {
                        forestStatusMap.put(forest, new ForestInfo(hostName, -1, true));
                    }
                    forest = null;
                }
            }
            if (forestStatusMap.size() == 0) {
                throw new IOException("Number of forests is 0: " + "check forests in database");
            }
            am.initialize(policy, forestStatusMap, conf.getInt(BATCH_SIZE, 10));
            return forestStatusMap;
        } catch (RequestException e) {
            LOG.error(e.getMessage(), e);
            throw new IOException(e);
        } finally {
            if (result != null) {
                result.close();
            }
            if (session != null) {
                session.close();
            }
        }
    }
}
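
Example Usage

The class Javadoc above spells out the output contract: the job's output key is the DocumentURI of each document and the output value is its content, as Text or MarkLogicNode. The sketch below wires ContentOutputFormat into a Hadoop job driver along those lines. It is a minimal, hypothetical setup, not the connector's shipped example: the connection property names are assumed to follow the MarkLogicConstants naming used by the connector ("mapreduce.marklogic.output.*"), and the InputFormat and Mapper that actually produce (DocumentURI, Text) pairs are elided.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;

import com.marklogic.mapreduce.ContentOutputFormat;
import com.marklogic.mapreduce.DocumentURI;

public class ContentLoaderJob {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Output connection settings; the property names are assumed from
        // MarkLogicConstants and should be verified against your version.
        conf.set("mapreduce.marklogic.output.host", "localhost");
        conf.set("mapreduce.marklogic.output.port", "8000");
        conf.set("mapreduce.marklogic.output.username", "admin");
        conf.set("mapreduce.marklogic.output.password", "admin");
        // Optional content settings such as the output directory; see
        // MarkLogicConstants for permissions, collections, and quality.
        conf.set("mapreduce.marklogic.output.content.directory", "/loaded/");

        Job job = Job.getInstance(conf, "content loader");
        job.setJarByClass(ContentLoaderJob.class);
        job.setOutputFormatClass(ContentOutputFormat.class);
        // The key is the target document URI; the value is the content.
        job.setOutputKeyClass(DocumentURI.class);
        job.setOutputValueClass(Text.class);
        // An InputFormat and a Mapper emitting (DocumentURI, Text) pairs
        // must be configured here; both are omitted in this sketch.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Note that when the fastload property (OUTPUT_FAST_LOAD) is left unset, checkOutputSpecs() decides it automatically, as shown in initialize(): fastload is only attempted when an output directory is configured.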