org.springframework.data.hadoop.pig.PigServerFactoryBean.java Source code

Java tutorial

Introduction

Here is the source code for org.springframework.data.hadoop.pig.PigServerFactoryBean.java

Source

/*
 * Copyright 2011 the original author or authors.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.springframework.data.hadoop.pig;

import java.security.PrivilegedExceptionAction;
import java.util.Collection;

import org.apache.hadoop.security.UserGroupInformation;
import org.apache.pig.PigServer;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.impl.PigContext;
import org.springframework.beans.factory.BeanCreationException;
import org.springframework.beans.factory.BeanNameAware;
import org.springframework.beans.factory.FactoryBean;
import org.springframework.beans.factory.ObjectFactory;
import org.springframework.util.CollectionUtils;
import org.springframework.util.StringUtils;

/**
 * Factory for creating a {@link PigServer} instance. Note that since PigServer is not thread-safe and the Pig API does not
 * provide some type of factory, the factory bean returns an instance of {@link ObjectFactory} (which handles the creation of {@link PigServer} instances) 
 * instead of the raw {@link PigServer} object which cannot be reused. 
 * 
 * Note that the caller needs to handle the object clean-up,  specifically calling {@link PigServer#shutdown()}. 
 * 
 * In general, to avoid leaks it is recommended to use the {@link PigTemplate}.
 * 
 * @author Costin Leau
 */
public class PigServerFactoryBean implements FactoryBean<PigServerFactory>, BeanNameAware {

    private PigContext pigContext;
    private Collection<String> pathToSkip;
    private Collection<PigScript> scripts;
    private Integer parallelism;
    private String jobName;
    private String jobPriority;
    private Boolean validateEachStatement;
    private String beanName;

    private String user;

    private class DefaultPigServerFactory implements PigServerFactory {
        @Override
        public PigServer getPigServer() {
            try {
                return createPigInstance();
            } catch (Exception ex) {
                throw new BeanCreationException("Cannot create PigServer instance", ex);
            }
        }
    };

    public PigServerFactory getObject() throws Exception {
        return new DefaultPigServerFactory();
    }

    public Class<?> getObjectType() {
        return PigServerFactory.class;
    }

    public boolean isSingleton() {
        return true;
    }

    protected PigServer createPigInstance() throws Exception {
        final PigContext ctx = (pigContext != null ? pigContext : new PigContext());

        // apparently if not connected, pig can cause all kind of errors
        PigServer pigServer = null;

        try {
            if (StringUtils.hasText(user)) {
                UserGroupInformation ugi = UserGroupInformation.createProxyUser(user,
                        UserGroupInformation.getLoginUser());
                pigServer = ugi.doAs(new PrivilegedExceptionAction<PigServer>() {
                    @Override
                    public PigServer run() throws Exception {
                        return new PigServer(ctx, true);
                    }
                });
            } else {
                pigServer = new PigServer(ctx, true);
            }
        } catch (ExecException ex) {
            throw PigUtils.convert(ex);
        }

        if (!CollectionUtils.isEmpty(pathToSkip)) {
            for (String path : pathToSkip) {
                pigServer.addPathToSkip(path);
            }
        }

        if (parallelism != null) {
            pigServer.setDefaultParallel(parallelism);
        }

        if (StringUtils.hasText(jobName)) {
            pigServer.setJobName(jobName);
        } else {
            if (StringUtils.hasText(beanName)) {
                pigServer.setJobName(beanName);
            }
        }

        if (StringUtils.hasText(jobPriority)) {
            pigServer.setJobPriority(jobPriority);
        }

        if (validateEachStatement != null) {
            PigUtils.validateEachStatement(pigServer, validateEachStatement);
        }

        if (!CollectionUtils.isEmpty(scripts)) {
            PigUtils.runWithConversion(pigServer, scripts, false);
        }

        return pigServer;
    }

    public void setBeanName(String name) {
        this.beanName = name;
    }

    /**
     * Sets the {@link PigContext} to use.
     * 
     * @param pigContext The pigContext to set.
     */
    public void setPigContext(PigContext pigContext) {
        this.pigContext = pigContext;
    }

    /**
     * Sets the paths to skip.
     * 
     * @param pathToSkip The pathToSkip to set.
     */
    public void setPathsToSkip(Collection<String> pathToSkip) {
        this.pathToSkip = pathToSkip;
    }

    /**
     * Sets the scripts to execute at startup.
     * 
     * @param scripts The scripts to set.
     */
    public void setScripts(Collection<PigScript> scripts) {
        this.scripts = scripts;
    }

    /**
     * Sets the parallelism.
     * 
     * @param parallelism The parallelism to set.
     */
    public void setParallelism(Integer parallelism) {
        this.parallelism = parallelism;
    }

    /**
     * Sets the job name.
     * 
     * @param jobName The jobName to set.
     */
    public void setJobName(String jobName) {
        this.jobName = jobName;
    }

    /**
     * Sets the job priority.
     * 
     * @param jobPriority The jobPriority to set.
     */
    public void setJobPriority(String jobPriority) {
        this.jobPriority = jobPriority;
    }

    /**
     * Indicates whether each statement should be validated or not. By default it is unset,
     * relying on the Pig defaults.
     * 
     * @param validateEachStatement whether to validate each statement or not.
     */
    public void setValidateEachStatement(Boolean validateEachStatement) {
        this.validateEachStatement = validateEachStatement;
    }

    /**
     * Sets the user impersonation (optional) for executing Pig jobs.
     * Should be used when running against a Hadoop Kerberos cluster. 
     * 
     * @param user user/group information
     */
    public void setUser(String user) {
        this.user = user;
    }
}