org.apache.accumulo.master.tableOps.bulkVer1.BulkImport.java Source code

Introduction

Here is the source code for org.apache.accumulo.master.tableOps.bulkVer1.BulkImport.java
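
BulkImport is the first step of Accumulo's original (version 1) bulk import FATE operation. It reserves the source and error directories, checks that the error directory exists and is empty, moves the candidate files into a freshly created bulk directory under the table's directory, and then hands off to the LoadFiles step, which asks the tablet servers to load them.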

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.accumulo.master.tableOps.bulkVer1;

import static org.apache.accumulo.fate.util.UtilWaitThread.sleepUninterruptibly;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;

import org.apache.accumulo.core.Constants;
import org.apache.accumulo.core.client.impl.AcceptableThriftTableOperationException;
import org.apache.accumulo.core.client.impl.Table;
import org.apache.accumulo.core.client.impl.Tables;
import org.apache.accumulo.core.client.impl.thrift.TableOperation;
import org.apache.accumulo.core.client.impl.thrift.TableOperationExceptionType;
import org.apache.accumulo.core.conf.Property;
import org.apache.accumulo.core.file.FileOperations;
import org.apache.accumulo.core.master.state.tables.TableState;
import org.apache.accumulo.core.master.thrift.BulkImportState;
import org.apache.accumulo.core.util.SimpleThreadPool;
import org.apache.accumulo.fate.Repo;
import org.apache.accumulo.master.Master;
import org.apache.accumulo.master.tableOps.MasterRepo;
import org.apache.accumulo.master.tableOps.Utils;
import org.apache.accumulo.server.ServerConstants;
import org.apache.accumulo.server.fs.VolumeManager;
import org.apache.accumulo.server.tablets.UniqueNameAllocator;
import org.apache.accumulo.server.util.MetadataTableUtil;
import org.apache.accumulo.server.zookeeper.TransactionWatcher.ZooArbitrator;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapFile;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Bulk import makes requests of tablet servers, and those requests can take a long time. Our
 * communications to the tablet server may fail, so we won't know the status of a request. The
 * master will repeat failed requests, so a tablet server may receive the same request multiple
 * times. The tablet server will not execute a request more than once, so long as the marker it
 * wrote in the metadata table stays there. The master needs to know when all requests have
 * finished so it can remove the markers. We can see that *a* request completed by seeing the flag
 * written into the metadata table, but we won't know whether some other rogue thread is still
 * waiting to start and repeat the operation.
 *
 * The master can ask the tablet server if it has any requests still running, except the tablet
 * server might have a thread that is about to start a request but has not yet recorded any
 * bookkeeping about it. To prevent problems like this, an Arbitrator is used. Before starting any
 * new request, the tablet server checks the Arbitrator to see if the request is still valid.
 */
public class BulkImport extends MasterRepo {
    public static final String FAILURES_TXT = "failures.txt";

    private static final long serialVersionUID = 1L;

    private static final Logger log = LoggerFactory.getLogger(BulkImport.class);

    private Table.ID tableId;
    private String sourceDir;
    private String errorDir;
    private boolean setTime;

    public BulkImport(Table.ID tableId, String sourceDir, String errorDir, boolean setTime) {
        this.tableId = tableId;
        this.sourceDir = sourceDir;
        this.errorDir = errorDir;
        this.setTime = setTime;
    }

    @Override
    public long isReady(long tid, Master master) throws Exception {
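        // A non-zero return value tells FATE to retry isReady() after that many milliseconds;
        // zero means the operation is ready for call() to run.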
        if (!Utils.getReadLock(tableId, tid).tryLock())
            return 100;

        Tables.clearCache(master);
        if (Tables.getTableState(master, tableId) == TableState.ONLINE) {
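            // Reserve the source directory first; only if that succeeds immediately (returns 0)
            // attempt to reserve the error directory as well. A non-zero reservation delay is
            // returned so isReady() will be retried.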
            long reserve1, reserve2;
            reserve1 = reserve2 = Utils.reserveHdfsDirectory(sourceDir, tid);
            if (reserve1 == 0)
                reserve2 = Utils.reserveHdfsDirectory(errorDir, tid);
            return reserve2;
        } else {
            throw new AcceptableThriftTableOperationException(tableId.canonicalID(), null,
                    TableOperation.BULK_IMPORT, TableOperationExceptionType.OFFLINE, null);
        }
    }

    @Override
    public Repo<Master> call(long tid, Master master) throws Exception {
        log.debug(" tid {} sourceDir {}", tid, sourceDir);

        Utils.getReadLock(tableId, tid).lock();

        // check that the error directory exists and is empty
        VolumeManager fs = master.getFileSystem();

        Path errorPath = new Path(errorDir);
        FileStatus errorStatus = null;
        try {
            errorStatus = fs.getFileStatus(errorPath);
        } catch (FileNotFoundException ex) {
            // ignored
        }
        if (errorStatus == null)
            throw new AcceptableThriftTableOperationException(tableId.canonicalID(), null,
                    TableOperation.BULK_IMPORT, TableOperationExceptionType.BULK_BAD_ERROR_DIRECTORY,
                    errorDir + " does not exist");
        if (!errorStatus.isDirectory())
            throw new AcceptableThriftTableOperationException(tableId.canonicalID(), null,
                    TableOperation.BULK_IMPORT, TableOperationExceptionType.BULK_BAD_ERROR_DIRECTORY,
                    errorDir + " is not a directory");
        if (fs.listStatus(errorPath).length != 0)
            throw new AcceptableThriftTableOperationException(tableId.canonicalID(), null,
                    TableOperation.BULK_IMPORT, TableOperationExceptionType.BULK_BAD_ERROR_DIRECTORY,
                    errorDir + " is not empty");

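        // Register this transaction with the arbitrator so tablet servers can check that a
        // (possibly repeated) bulk load request is still valid; see the class-level comment.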
        ZooArbitrator.start(Constants.BULK_ARBITRATOR_TYPE, tid);
        master.updateBulkImportStatus(sourceDir, BulkImportState.MOVING);
        // move the files into the directory
        try {
            String bulkDir = prepareBulkImport(master, fs, sourceDir, tableId);
            log.debug(" tid {} bulkDir {}", tid, bulkDir);
            return new LoadFiles(tableId, sourceDir, bulkDir, errorDir, setTime);
        } catch (IOException ex) {
            log.error("error preparing the bulk import directory", ex);
            throw new AcceptableThriftTableOperationException(tableId.canonicalID(), null,
                    TableOperation.BULK_IMPORT, TableOperationExceptionType.BULK_BAD_INPUT_DIRECTORY,
                    sourceDir + ": " + ex);
        }
    }

    private Path createNewBulkDir(VolumeManager fs, Table.ID tableId) throws IOException {
        Path tempPath = fs.matchingFileSystem(new Path(sourceDir), ServerConstants.getTablesDirs());
        if (tempPath == null)
            throw new IOException(sourceDir + " is not in a volume configured for Accumulo");

        String tableDir = tempPath.toString();
        Path directory = new Path(tableDir + "/" + tableId);
        fs.mkdirs(directory);

        // UniqueNameAllocator hands out cluster-unique names, so two callers should never race on
        // the same bulk directory name. The fs.exists() check below is only a sanity check, since
        // Hadoop's mkdirs() succeeds even when the directory already exists.

        UniqueNameAllocator namer = UniqueNameAllocator.getInstance();

        while (true) {
            Path newBulkDir = new Path(directory, Constants.BULK_PREFIX + namer.getNextName());
            if (fs.exists(newBulkDir)) // sanity check
                throw new IOException("Dir exist when it should not " + newBulkDir);
            if (fs.mkdirs(newBulkDir))
                return newBulkDir;
            log.warn("Failed to create {} for unknown reason", newBulkDir);

            sleepUninterruptibly(3, TimeUnit.SECONDS);
        }
    }

    private String prepareBulkImport(Master master, final VolumeManager fs, String dir, Table.ID tableId)
            throws Exception {
        final Path bulkDir = createNewBulkDir(fs, tableId);

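        // Write a bulk-load-in-progress marker to the metadata table so that, if this operation
        // dies mid-move, recovery can find and clean up the new bulk directory.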
        MetadataTableUtil.addBulkLoadInProgressFlag(master,
                "/" + bulkDir.getParent().getName() + "/" + bulkDir.getName());

        Path dirPath = new Path(dir);
        FileStatus[] mapFiles = fs.listStatus(dirPath);

        final UniqueNameAllocator namer = UniqueNameAllocator.getInstance();

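        // Rename the candidate files into the bulk directory in parallel; the pool size comes
        // from the master's bulk rename thread count property.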
        int workerCount = master.getConfiguration().getCount(Property.MASTER_BULK_RENAME_THREADS);
        SimpleThreadPool workers = new SimpleThreadPool(workerCount, "bulk move");
        List<Future<Exception>> results = new ArrayList<>();

        for (FileStatus file : mapFiles) {
            final FileStatus fileStatus = file;
            results.add(workers.submit(() -> {
                try {
                    String[] sa = fileStatus.getPath().getName().split("\\.");
                    String extension = "";
                    if (sa.length > 1) {
                        extension = sa[sa.length - 1];

                        if (!FileOperations.getValidExtensions().contains(extension)) {
                            log.warn("{} does not have a valid extension, ignoring", fileStatus.getPath());
                            return null;
                        }
                    } else {
                        // assume it is a map file
                        extension = Constants.MAPFILE_EXTENSION;
                    }

                    if (extension.equals(Constants.MAPFILE_EXTENSION)) {
                        if (!fileStatus.isDirectory()) {
                            log.warn("{} is not a map file, ignoring", fileStatus.getPath());
                            return null;
                        }

                        if (fileStatus.getPath().getName().equals("_logs")) {
                            log.info("{} is probably a log directory from a map/reduce task, skipping",
                                    fileStatus.getPath());
                            return null;
                        }
                        try {
                            FileStatus dataStatus = fs
                                    .getFileStatus(new Path(fileStatus.getPath(), MapFile.DATA_FILE_NAME));
                            if (dataStatus.isDirectory()) {
                                log.warn("{} is not a map file, ignoring", fileStatus.getPath());
                                return null;
                            }
                        } catch (FileNotFoundException fnfe) {
                            log.warn("{} is not a map file, ignoring", fileStatus.getPath());
                            return null;
                        }
                    }

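                    // Imported files are renamed to a leading "I" plus a cluster-unique name,
                    // distinguishing bulk-imported files from files written by flushes and
                    // compactions.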
                    String newName = "I" + namer.getNextName() + "." + extension;
                    Path newPath = new Path(bulkDir, newName);
                    try {
                        fs.rename(fileStatus.getPath(), newPath);
                        log.debug("Moved {} to {}", fileStatus.getPath(), newPath);
                    } catch (IOException ioe) {
                        log.error("Could not move: {}", fileStatus.getPath(), ioe);
                    }

                } catch (Exception ex) {
                    return ex;
                }
                return null;
            }));
        }
        workers.shutdown();
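        // Block until every rename task has completed before checking their results.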
        while (!workers.awaitTermination(1000L, TimeUnit.MILLISECONDS)) {
        }

        for (Future<Exception> ex : results) {
            if (ex.get() != null) {
                throw ex.get();
            }
        }
        return bulkDir.toString();
    }

    @Override
    public void undo(long tid, Master environment) throws Exception {
        // unreserve source/error directories
        Utils.unreserveHdfsDirectory(sourceDir, tid);
        Utils.unreserveHdfsDirectory(errorDir, tid);
        Utils.getReadLock(tableId, tid).unlock();
        ZooArbitrator.cleanup(Constants.BULK_ARBITRATOR_TYPE, tid);
    }
}
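
Usage

The repo above runs inside the master as part of a FATE transaction; clients reach it through the public table operations API. Below is a minimal sketch of how a client could trigger this code path with TableOperations.importDirectory from the Accumulo 1.x client API. The instance name, ZooKeeper host, credentials, table name, and paths are placeholders; the failure directory must exist and be empty, exactly as call() enforces above.

import org.apache.accumulo.core.client.Connector;
import org.apache.accumulo.core.client.ZooKeeperInstance;
import org.apache.accumulo.core.client.security.tokens.PasswordToken;

public class BulkImportClientExample {
    public static void main(String[] args) throws Exception {
        // Placeholder instance name, ZooKeeper quorum, and credentials.
        Connector conn = new ZooKeeperInstance("myInstance", "zkhost:2181")
                .getConnector("root", new PasswordToken("secret"));

        // The source directory holds the files to import; the failure directory must exist and
        // be empty, matching the BULK_BAD_ERROR_DIRECTORY checks in call() above. The final
        // argument is setTime, which is carried through to the BulkImport constructor.
        conn.tableOperations().importDirectory("mytable", "/tmp/bulk/files",
                "/tmp/bulk/failures", false);
    }
}

On the server side, this call ends up constructing the BulkImport repo above with the same table id, directories, and setTime flag.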