com.asakusafw.bulkloader.extractor.DfsFileImport.java Source code

Java tutorial

Introduction

Here is the source code for com.asakusafw.bulkloader.extractor.DfsFileImport.java

Source

/**
 * Copyright 2011-2016 Asakusa Framework Team.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.asakusafw.bulkloader.extractor;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.lang.ProcessBuilder.Redirect;
import java.net.URI;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.CancellationException;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile.CompressionType;

import com.asakusafw.bulkloader.bean.ImportBean;
import com.asakusafw.bulkloader.bean.ImportTargetTableBean;
import com.asakusafw.bulkloader.common.ConfigurationLoader;
import com.asakusafw.bulkloader.common.Constants;
import com.asakusafw.bulkloader.common.FileNameUtil;
import com.asakusafw.bulkloader.exception.BulkLoaderSystemException;
import com.asakusafw.bulkloader.log.Log;
import com.asakusafw.bulkloader.transfer.FileList;
import com.asakusafw.bulkloader.transfer.FileProtocol;
import com.asakusafw.runtime.io.ModelInput;
import com.asakusafw.runtime.io.ModelOutput;
import com.asakusafw.runtime.io.TsvIoFactory;
import com.asakusafw.runtime.stage.temporary.TemporaryStorage;
import com.asakusafw.thundergate.runtime.cache.CacheInfo;
import com.asakusafw.thundergate.runtime.cache.CacheStorage;
import com.asakusafw.thundergate.runtime.cache.mapreduce.CacheBuildClient;

/**
 * Imports the files received as a {@link FileList} into the DFS.
 * @author yuta.shirai
 */
public class DfsFileImport {

    // Logger shared by this class and by the cache builder tasks it creates.
    static final Log LOG = new Log(DfsFileImport.class);

    // Buffer size, in bytes, of the stream returned by getInputStream() (128 * 1024).
    private static final int INPUT_BUFFER_BYTES = 128 * 1024;

    // Thread pool on which the cache builder tasks are executed.
    private final ExecutorService executor;

    // Absolute path of the local cache build script; first element of the builder command line.
    private final String cacheBuildCommand;

    /**
     * Creates a new instance.
     */
    public DfsFileImport() {
        // Keep the absolute path of the local script which launches the cache builder.
        this.cacheBuildCommand = ConfigurationLoader.getLocalScriptPath(Constants.PATH_LOCAL_CACHE_BUILD)
                .getAbsolutePath();

        // The pool size bounds how many cache builder tasks run at the same time.
        String configured = ConfigurationLoader.getProperty(Constants.PROP_KEY_CACHE_BUILDER_PARALLEL);
        int threads = Integer.parseInt(configured);
        LOG.debugMessage("Building a cache builder with {0} threads", threads);
        this.executor = Executors.newFixedThreadPool(threads);
    }

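    /**
     * Writes the import target files into the DFS.
     * Each file received through the {@link FileList} is converted from TSV into model objects
     * and then written to its target location.
     * Entries of the following kinds are handled:
     * <ul>
     * <li> {@link com.asakusafw.bulkloader.transfer.FileProtocol.Kind#CONTENT} </li>
     * <li> {@link com.asakusafw.bulkloader.transfer.FileProtocol.Kind#CREATE_CACHE} </li>
     * <li> {@link com.asakusafw.bulkloader.transfer.FileProtocol.Kind#UPDATE_CACHE} </li>
     * </ul>
     * @param bean the bean which holds the import parameters
     * @param user the OS user name, used for resolving target locations
     * @return {@code true} if the import succeeded, otherwise {@code false}
     */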
    public boolean importFile(ImportBean bean, String user) {
        // Open the FileList supplied through getInputStream().
        FileList.Reader reader;
        try {
            reader = FileList.createReader(getInputStream());
        } catch (IOException e) {
            LOG.error(e, "TG-EXTRACTOR-02001", "?FileList???");
            return false;
        }
        // Cache builder tasks submitted so far. Declared outside the try block so that
        // every failure path can cancel them instead of leaving them running.
        List<Future<?>> running = new ArrayList<>();
        boolean succeeded = false;
        try {
            // Process the FileList entries one by one, in the order they arrive.
            while (reader.next()) {
                FileProtocol protocol = reader.getCurrentProtocol();
                try (InputStream content = reader.openContent()) {
                    switch (protocol.getKind()) {
                    case CONTENT:
                        importContent(protocol, content, bean, user);
                        break;

                    case CREATE_CACHE:
                    case UPDATE_CACHE:
                        // The patch is written synchronously; only the cache build itself runs in background.
                        long recordCount = putCachePatch(protocol, content, bean, user);
                        Callable<?> builder = createCacheBuilder(protocol, bean, user, recordCount);
                        if (builder != null) {
                            LOG.debugMessage("Submitting cache builder: {0} {1}", protocol.getKind(),
                                    protocol.getInfo().getTableName());
                            running.add(executor.submit(builder));
                        }
                        break;

                    default:
                        throw new AssertionError(protocol.getKind());
                    }
                }
            }

            // Block until every submitted cache builder has finished.
            waitForCompleteTasks(bean, running);
            succeeded = true;
            return true;

        } catch (BulkLoaderSystemException e) {
            LOG.log(e);
        } catch (IOException e) {
            // Failed to read the FileList.
            LOG.error(e, "TG-EXTRACTOR-02001", "?FileList???");
        } finally {
            if (succeeded == false) {
                // The import is being abandoned (either by returning false or by a propagating
                // unchecked exception): cancel the builders which are still queued or running.
                // Cancelling an already finished task is a no-op.
                cancel(running);
            }
            try {
                reader.close();
            } catch (IOException e) {
                // Best-effort close: a failure here must not change the import result.
                e.printStackTrace();
            }
        }
        return false;
    }

    /**
     * Imports a single content entry of the FileList.
     * The target table is derived from the entry location, and the TSV content is written to
     * the DFS path configured for that table.
     * @param protocol the protocol of the current entry
     * @param content the TSV content of the current entry
     * @param bean the bean which holds the import parameters
     * @param user the user name used for resolving the target location
     * @throws BulkLoaderSystemException if the import settings have no entry for the table,
     *     or if the location cannot be resolved or the content cannot be written
     */
    private void importContent(FileProtocol protocol, InputStream content, ImportBean bean, String user)
            throws BulkLoaderSystemException {
        assert protocol != null;
        assert content != null;
        assert bean != null;
        assert user != null;

        // The table name is encoded in the entry location.
        String tableName = FileNameUtil.getImportTableName(protocol.getLocation());
        ImportTargetTableBean table = bean.getTargetTable(tableName);
        if (table == null) {
            // The received table is not known to the import settings.
            String message = MessageFormat.format("????DSL?????{0}", tableName);
            throw new BulkLoaderSystemException(getClass(), "TG-EXTRACTOR-02001", message);
        }

        URI destination = resolveLocation(bean, user, table.getDfsFilePath());
        Class<?> modelClass = table.getImportTargetType();
        String destinationText = destination.toString();
        String modelText = modelClass.toString();

        LOG.info("TG-EXTRACTOR-02002", tableName, destinationText, modelText);

        // Convert the TSV content into model objects and write them out.
        long written = write(modelClass, destination, content);

        LOG.info("TG-EXTRACTOR-02003", tableName, destinationText, modelText);
        LOG.info("TG-PROFILE-01002", bean.getTargetName(), bean.getBatchId(), bean.getJobflowId(),
                bean.getExecutionId(), tableName, written);
    }

    /**
     * Stores the patch of a cache entry into its cache storage.
     * The cache information is recorded first, then the TSV content is written as the patch contents.
     * @param protocol the protocol of the current entry (either CREATE_CACHE or UPDATE_CACHE)
     * @param content the TSV content of the current entry
     * @param bean the bean which holds the import parameters
     * @param user the user name used for resolving the cache location
     * @return the number of records written into the patch
     * @throws BulkLoaderSystemException if the import settings have no entry for the table,
     *     or if writing the patch fails
     */
    private long putCachePatch(FileProtocol protocol, InputStream content, ImportBean bean, String user)
            throws BulkLoaderSystemException {
        assert protocol != null;
        assert content != null;
        assert bean != null;
        assert user != null;
        assert protocol.getKind() == FileProtocol.Kind.CREATE_CACHE
                || protocol.getKind() == FileProtocol.Kind.UPDATE_CACHE;

        CacheInfo info = protocol.getInfo();
        assert info != null;

        ImportTargetTableBean table = bean.getTargetTable(info.getTableName());
        if (table == null) {
            // The received table is not known to the import settings.
            String message = MessageFormat.format("????DSL?????{0}", info.getTableName());
            throw new BulkLoaderSystemException(getClass(), "TG-EXTRACTOR-02001", message);
        }

        URI cacheLocation = resolveLocation(bean, user, protocol.getLocation());
        try (CacheStorage storage = new CacheStorage(new Configuration(), cacheLocation)) {
            // Step 1: record the cache information of the patch.
            LOG.info("TG-EXTRACTOR-11001", info.getId(), info.getTableName(), storage.getPatchProperties());
            storage.putPatchCacheInfo(info);
            LOG.info("TG-EXTRACTOR-11002", info.getId(), info.getTableName(), storage.getPatchProperties());

            // Step 2: write the received records as the patch contents.
            Class<?> modelClass = table.getImportTargetType();
            Path patchFile = storage.getPatchContents("0");
            LOG.info("TG-EXTRACTOR-11003", info.getId(), info.getTableName(), patchFile);
            long written = write(modelClass, patchFile.toUri(), content);
            LOG.info("TG-EXTRACTOR-11004", info.getId(), info.getTableName(), patchFile, written);
            LOG.info("TG-PROFILE-01002", bean.getTargetName(), bean.getBatchId(), bean.getJobflowId(),
                    bean.getExecutionId(), info.getTableName(), written);
            return written;
        } catch (IOException e) {
            throw new BulkLoaderSystemException(e, getClass(), "TG-EXTRACTOR-11005", info.getId(),
                    info.getTableName(), cacheLocation);
        }
    }

    /**
     * Prepares the cache builder task for a cache entry.
     * @param protocol the protocol of the current entry (either CREATE_CACHE or UPDATE_CACHE)
     * @param bean the bean which holds the import parameters
     * @param user the user name used for resolving the cache location
     * @param recordCount the number of records written into the patch
     * @return the task to submit, or {@code null} if the entry is an UPDATE_CACHE without any records
     * @throws BulkLoaderSystemException if the location cannot be resolved or the task cannot be prepared
     */
    private Callable<?> createCacheBuilder(FileProtocol protocol, ImportBean bean, String user, long recordCount)
            throws BulkLoaderSystemException {
        assert protocol != null;
        assert bean != null;
        assert user != null;
        CacheInfo info = protocol.getInfo();
        URI location = resolveLocation(bean, user, protocol.getLocation());
        assert info != null;

        // Pick the subcommand first; an update without records needs no build at all.
        String subcommand;
        switch (protocol.getKind()) {
        case CREATE_CACHE:
            subcommand = CacheBuildClient.SUBCOMMAND_CREATE;
            break;
        case UPDATE_CACHE:
            if (recordCount <= 0) {
                return null;
            }
            subcommand = CacheBuildClient.SUBCOMMAND_UPDATE;
            break;
        default:
            throw new AssertionError(protocol);
        }

        try {
            return createCacheBuilder(subcommand, bean, location, info);
        } catch (IOException e) {
            throw new BulkLoaderSystemException(e, getClass(), "TG-EXTRACTOR-12002", protocol.getKind(),
                    info.getId(), info.getTableName(), bean.getTargetName(), bean.getBatchId(), bean.getJobflowId(),
                    bean.getExecutionId());
        }
    }

    /**
     * Creates a cache builder for the specified cache (or its candidate).
     * @param subcommand subcommand name
     * @param bean current importer script
     * @param location cache location
     * @param info cache information
     * @return a task which runs the cache build command and waits for its completion when called
     * @throws IOException if failed to prepare the execution
     */
    protected Callable<?> createCacheBuilder(final String subcommand, ImportBean bean, final URI location,
            final CacheInfo info) throws IOException {
        assert subcommand != null;
        assert bean != null;
        assert location != null;
        assert info != null;

        // Command line layout:
        // <script> <subcommand> <batch ID> <jobflow ID> <execution ID> <location> <model class> <table>
        List<String> arguments = new ArrayList<>();
        arguments.add(cacheBuildCommand);
        arguments.add(subcommand);
        arguments.add(bean.getBatchId());
        arguments.add(bean.getJobflowId());
        arguments.add(bean.getExecutionId());
        arguments.add(location.toString());
        arguments.add(info.getModelClassName());
        arguments.add(info.getTableName());

        LOG.info("TG-EXTRACTOR-12001", subcommand, info.getId(), info.getTableName(), bean.getTargetName(),
                bean.getBatchId(), bean.getJobflowId(), bean.getExecutionId(), arguments);

        // The command runs in the user's home directory (or the current directory if unknown).
        final ProcessBuilder launcher = new ProcessBuilder(arguments);
        File workingDirectory = new File(System.getProperty("user.home", "."));
        launcher.directory(workingDirectory);

        return new Callable<Void>() {
            @Override
            public Void call() throws Exception {
                LOG.info("TG-EXTRACTOR-12003", subcommand, info.getId(), info.getTableName());
                // The child process shares stdout/stderr with this process.
                launcher.redirectOutput(Redirect.INHERIT);
                launcher.redirectError(Redirect.INHERIT);
                Process child = launcher.start();
                try {
                    int status = child.waitFor();
                    if (status != 0) {
                        String message = MessageFormat.format(
                                "Cache builder returns unexpected exit code: {0}", status);
                        throw new IOException(message);
                    }
                    LOG.info("TG-EXTRACTOR-12004", subcommand, info.getId(), info.getTableName());
                    return null;
                } catch (Exception e) {
                    // Covers abnormal exit codes as well as interruption while waiting.
                    throw new BulkLoaderSystemException(e, DfsFileImport.class, "TG-EXTRACTOR-12005", subcommand,
                            info.getId(), info.getTableName());
                } finally {
                    // Make sure the child does not outlive this task.
                    child.destroy();
                }
            }
        };
    }

    /**
     * Resolves target location.
     * @param bean importer bean
     * @param user current user name
     * @param location target location
     * @return the resolved location
     * @throws BulkLoaderSystemException if failed to resolve
     */
    protected URI resolveLocation(ImportBean bean, String user, String location) throws BulkLoaderSystemException {
        // The path is built from the location, the current execution ID and the user name,
        // using a freshly created configuration.
        String executionId = bean.getExecutionId();
        return FileNameUtil.createPath(new Configuration(), location, executionId, user).toUri();
    }

    /**
     * Waits until all of the submitted cache builder tasks have finished.
     * The tasks are polled in turn with a short timeout, so that a failure of any task is
     * noticed without waiting for the tasks submitted before it. On the first failure the
     * remaining tasks are cancelled.
     * @param bean the bean which holds the import parameters (used for log messages)
     * @param running the submitted tasks
     * @throws BulkLoaderSystemException if this thread was interrupted while waiting,
     *     or if any task failed or was cancelled
     */
    private void waitForCompleteTasks(ImportBean bean, List<Future<?>> running) throws BulkLoaderSystemException {
        assert bean != null;
        assert running != null;
        if (running.isEmpty()) {
            return;
        }
        LOG.info("TG-EXTRACTOR-12006", bean.getTargetName(), bean.getBatchId(), bean.getJobflowId(),
                bean.getExecutionId());

        boolean sawError = false;
        LinkedList<Future<?>> rest = new LinkedList<>(running);
        while (rest.isEmpty() == false) {
            Future<?> future = rest.removeFirst();
            try {
                future.get(1, TimeUnit.SECONDS);
            } catch (TimeoutException e) {
                // Not finished yet: move it to the tail and poll the next one.
                rest.addLast(future);
            } catch (CancellationException e) {
                // The task was cancelled, normally by cancel(rest) below after another task failed.
                // Future.get() reports this with an unchecked exception; without this clause it would
                // escape the loop and bypass the aggregated error reported at the end.
                sawError = true;
            } catch (InterruptedException e) {
                cancel(rest);
                // Restore the interrupt status so that callers can still observe the interruption.
                Thread.currentThread().interrupt();
                throw new BulkLoaderSystemException(e, getClass(), "TG-EXTRACTOR-12007", bean.getTargetName(),
                        bean.getBatchId(), bean.getJobflowId(), bean.getExecutionId());
            } catch (ExecutionException e) {
                // One builder failed: stop the others, then keep draining the queue.
                cancel(rest);
                Throwable cause = e.getCause();
                if (cause instanceof RuntimeException) {
                    throw (RuntimeException) cause;
                } else if (cause instanceof Error) {
                    throw (Error) cause;
                } else if (cause instanceof BulkLoaderSystemException) {
                    LOG.log((BulkLoaderSystemException) cause);
                    sawError = true;
                } else {
                    LOG.error(e, "TG-EXTRACTOR-12008", bean.getTargetName(), bean.getBatchId(), bean.getJobflowId(),
                            bean.getExecutionId());
                    sawError = true;
                }
            }
        }
        if (sawError) {
            throw new BulkLoaderSystemException(getClass(), "TG-EXTRACTOR-12008", bean.getTargetName(),
                    bean.getBatchId(), bean.getJobflowId(), bean.getExecutionId());
        } else {
            LOG.info("TG-EXTRACTOR-12009", bean.getTargetName(), bean.getBatchId(), bean.getJobflowId(),
                    bean.getExecutionId());
        }
    }

    /**
     * Requests cancellation of every given task.
     * Tasks which are already running are interrupted.
     * @param futures the tasks to cancel
     */
    private void cancel(List<Future<?>> futures) {
        assert futures != null;
        final boolean mayInterruptIfRunning = true;
        for (Future<?> task : futures) {
            task.cancel(mayInterruptIfRunning);
        }
    }

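    /**
     * Reads TSV records from the input stream and writes them to the DFS as model objects.
     * @param <T> the type of the import target model
     * @param targetTableModel the class of the import target model
     * @param dfsFilePath the location of the output file
     * @param inputStream the TSV content taken from the FileList
     * @return the number of records written
     * @throws BulkLoaderSystemException if reading the input or writing the output failed
     */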
    protected <T> long write(Class<T> targetTableModel, URI dfsFilePath, InputStream inputStream)
            throws BulkLoaderSystemException {
        Configuration conf = new Configuration();
        TsvIoFactory<T> tsv = new TsvIoFactory<>(targetTableModel);
        try (ModelInput<T> source = tsv.createModelInput(inputStream)) {
            // A single model object is reused for every record.
            long written = 0;
            T record = tsv.createModelObject();
            try (ModelOutput<T> sink = TemporaryStorage.openOutput(conf, targetTableModel,
                    new Path(dfsFilePath))) {
                for (; source.readTo(record); written++) {
                    sink.write(record);
                }
            }
            // The output has been closed at this point; the input is closed on leaving the outer block.
            return written;
        } catch (IOException e) {
            throw new BulkLoaderSystemException(e, getClass(), "TG-EXTRACTOR-02001",
                    "DFS??????URI" + dfsFilePath);
        }
    }

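    /**
     * Resolves a compression type from its name.
     * @param strCompType the name of a {@link CompressionType} constant
     * @return the matching CompressionType, or {@link CompressionType#NONE} if the name cannot be resolved
     */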
    protected CompressionType getCompType(String strCompType) {
        try {
            return CompressionType.valueOf(strCompType);
        } catch (Exception e) {
            // The name could not be resolved: report it and fall back to no compression.
            CompressionType fallback = CompressionType.NONE;
            LOG.warn("TG-EXTRACTOR-02004", strCompType);
            return fallback;
        }
    }

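    /**
     * Returns the stream from which the FileList is read.
     * This implementation returns a buffered view of the standard input.
     * @return the input stream
     * @throws IOException if failed to open stream
     */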
    protected InputStream getInputStream() throws IOException {
        // Wrap the standard input so that the FileList is read in large chunks.
        InputStream stdin = System.in;
        return new BufferedInputStream(stdin, INPUT_BUFFER_BYTES);
    }
}