Source code

Java tutorial


Here is the source code for SwiftAPIClient.java


/**
 * (C) Copyright IBM Corp. 2015, 2016
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;

import org.javaswift.joss.client.factory.AccountConfig;
import org.javaswift.joss.client.factory.AccountFactory;
import org.javaswift.joss.client.factory.AuthenticationMethod;
import org.javaswift.joss.model.Access;
import org.javaswift.joss.model.Account;
import org.javaswift.joss.model.Container;
import org.javaswift.joss.model.DirectoryOrObject;
import org.javaswift.joss.model.PaginationMap;
import org.javaswift.joss.model.StoredObject;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.fs.FileSystem.Statistics;



import static;
import static;
import static;
import static;
import static;
import static;
import static;
import static;
import static;
import static;
import static;
// import static;
// import static;
// import static;
import static;
import static;
import static;
import static;
import static;
import static;
import static;

/**
 * Swift back-end driver
 */
public class SwiftAPIClient implements IStoreClient {

    /**
     * Logger
     */
    private static final Logger LOG = LoggerFactory.getLogger(SwiftAPIClient.class);
     * Time pattern
    private static final String TIME_PATTERN = "EEE, d MMM yyyy hh:mm:ss zzz";
     * root container
    private final String container;
     * should use public or private URL
    private final boolean usePublicURL;
     * JOSS account object
    private Account mAccount;
     * JOSS authentication object
    private Access mAccess;
     * block size
    private long blockSize;
     * The version of the CSVStorlet
    private int storletVersion;
     * the record delimiter for CSV files
    private String theRecordDelimiter;
     * the max CSV record size
    private long theMaxRecordSize;
     * the dynamic log debug level for CSV storlet
    private int dynamicStorletDebug;

     * If true, automatic delete will be activated on the
     * data generated from failed tasks
    private boolean fModeAutomaticDelete;

     * Contains map of object names that were written by Spark.
     * Used in container listing
    private Map<String, Boolean> cachedSparkOriginated;

     * Contains map of objects that were created by successfull Spark jobs.
     * Used in container listing
    private Map<String, Boolean> cachedSparkJobsStatus;

     * Page size for container listing
    private final int pageListSize = 100;

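    /**
     * Constructor method
     *
     * @param filesystemURI URI of the file system, used to initialize the configuration
     * @param conf Configuration
     * @throws IOException if the client cannot be initialized
     */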
    public SwiftAPIClient(URI filesystemURI, Configuration conf) throws IOException {
        // Reads the container name and block-size settings from the properties,
        // configures the JOSS account for the selected authentication method,
        // authenticates, and creates the two listing caches.
        // NOTE(review): this listing is truncated. Several statements and every
        // closing brace of this constructor are missing (see the notes below);
        // restore them from the upstream source before compiling.
        String preferredRegion = null;
        Properties props = ConfigurationHandler.initialize(filesystemURI, conf);
        container = props.getProperty(SWIFT_CONTAINER_PROPERTY);
        String isPubProp = props.getProperty(SWIFT_PUBLIC_PROPERTY, "false");
        usePublicURL = "true".equals(isPubProp);
        // SWIFT_BLOCK_SIZE_PROPERTY value is in bytes. The retained effective value is
        // rawBlockSize rounded down to a multiple of 1024 (see blockSize below).
        // NOTE(review): the statement below has no terminating ';' - its
        // continuation line appears to be missing.
        long rawBlockSize = Long.valueOf(props.getProperty(SWIFT_BLOCK_SIZE_PROPERTY, SWIFT_BLOCK_SIZE_DEFAULT))

        // The lower 10 bits of rawBlockSize are used to encode development info
        // such as the CSVStorlet version to be used and the dynamic debug level
        blockSize = (rawBlockSize / 1024) * 1024; // erase the lower 10 bits
        // The cast binds tighter than %, so this is 1 + (((int) rawBlockSize) % 16):
        // for a non-negative value, the low 4 bits plus one.
        storletVersion = 1 + ((int) rawBlockSize % 16);

        // For a non-negative value this extracts bits 4 and 5 of rawBlockSize.
        dynamicStorletDebug = (int) rawBlockSize;
        dynamicStorletDebug = dynamicStorletDebug % 64;
        dynamicStorletDebug = dynamicStorletDebug >> 4;

        // NOTE(review): these settings are reported at WARN level - confirm that
        // this is intended rather than INFO/DEBUG.
        LOG.warn(SWIFT_BLOCK_SIZE_PROPERTY + " --> " + blockSize);
        LOG.warn(" Dynamic storlet debug --> " + dynamicStorletDebug);

        // Following are needed for the invocation of the CSV SQL pushdown storlet:
        theRecordDelimiter = DEFAULT_RECORD_DELIMITER;
        theMaxRecordSize = DEFAULT_MAX_RECORD_SIZE;

        LOG.warn("swift_csv_record_delimiter_property --> " + theRecordDelimiter);
        LOG.warn("swift_max_record_size_property  --> " + theMaxRecordSize);
        LOG.warn("swift_dynamic_storlet_debug_property --> " + dynamicStorletDebug);
        LOG.warn("CSVStorlet version  --> " + storletVersion);

        AccountConfig config = new AccountConfig();
        config.setAuthUrl(Utils.getOption(props, SWIFT_AUTH_PROPERTY));
        String authMethod = props.getProperty(SWIFT_AUTH_METHOD_PROPERTY);
        fModeAutomaticDelete = "true".equals(props.getProperty(FMODE_AUTOMATIC_DELETE_PROPERTY, "false"));
        // NOTE(review): getProperty returns null when the key is absent, so this
        // equals() call throws NullPointerException if no auth method is configured.
        if (authMethod.equals("keystone")) {
            preferredRegion = props.getProperty(SWIFT_REGION_PROPERTY);
            // NOTE(review): the body of this 'if' and its closing brace are missing.
            if (preferredRegion != null) {
            config.setUsername(Utils.getOption(props, SWIFT_USERNAME_PROPERTY));
        } else if (authMethod.equals(KEYSTONE_V3_AUTH)) {
            // The region falls back to "dallas" when the property is not set.
            preferredRegion = props.getProperty(SWIFT_REGION_PROPERTY, "dallas");
            String userId = props.getProperty(SWIFT_USER_ID_PROPERTY);
            String projectId = props.getProperty(SWIFT_PROJECT_ID_PROPERTY);
            // NOTE(review): psap is not used by any visible statement - the code that
            // hands it to the account configuration appears to be missing.
            PasswordScopeAccessProvider psap = new PasswordScopeAccessProvider(userId, config.getPassword(),
                    projectId, config.getAuthUrl(), preferredRegion);
        } else {
            config.setTenantName(Utils.getOption(props, SWIFT_USERNAME_PROPERTY));
        // NOTE(review): the closing brace of the 'else' above is missing; mapper is
        // configured but not used by any visible statement.
        ObjectMapper mapper = new ObjectMapper();
        mapper.configure(SerializationConfig.Feature.WRAP_ROOT_VALUE, true);
        mAccount = new AccountFactory(config).createAccount();
        mAccess = mAccount.authenticate();
        // NOTE(review): the body of this 'if' is missing.
        if (preferredRegion != null) {
        // Caches consulted by isSparkOrigin / isJobSuccessfull during listing.
        cachedSparkOriginated = new HashMap<String, Boolean>();
        cachedSparkJobsStatus = new HashMap<String, Boolean>();
        Container containerObj = mAccount.getContainer(container);
        // NOTE(review): the body of this 'if' and the constructor's closing brace are missing.
        if (!containerObj.exists()) {

    public String getScheme() {
        return Constants.SWIFT;

    public String getDataRoot() {
        return container;

    public long getBlockSize() {
        return blockSize;

    public int getDynamicStorletDebug() {
        return dynamicStorletDebug;

    public int getStorletVersion() {
        return storletVersion;

    public long getMaxRecordSize() {
        return theMaxRecordSize;

    public String getCsvRecordDelimiter() {
        return theRecordDelimiter;

    public Account getAccount() {
        return mAccount;

    public FileStatus getObjectMetadata(String hostName, Path path) throws IOException, FileNotFoundException {
        LOG.debug("Get object metadata: {}, hostname: {}", path, hostName);
        Container cont = mAccount.getContainer(container);
          The requested path is equal to hostName.
          HostName is equal to hostNameScheme, thus the container.
          Therefore we have no object to look for and
          we return the FileStatus as a directory.
          Containers have to lastModified.
        if (path.toString().equals(hostName)) {
            LOG.debug("Object metadata requested on container!");
            return new FileStatus(0L, true, 1, blockSize, 0L, path);
          The requested path is not equal to the container.
          We need to check if the object requested is a real object or a directory.
          This may be triggered when users want to access a directory rather than
          the entire container.
          A directory in Swift can have two implementations:
          1) a zero byte object with the name of the directory
          2) no zero byte object with the name of the directory
        String objectName = path.toString().substring(hostName.length());
        if (objectName.endsWith("/")) {
              removing the trailing slash because it is not supported in Swift
              an request on an object (not a container) that has a trailing slash will lead
              to a 404 response message
            objectName = objectName.substring(0, objectName.length() - 1);
        StoredObject so = cont.getObject(objectName);
        boolean isDirectory = false;
        if (so.exists()) {
            // We need to check if the object size is equal to zero
            // If so, it might be a directory
            long contentLength = so.getContentLength();
            String lastModified = so.getLastModified();
            if (contentLength == 0) {
                Collection<DirectoryOrObject> directoryFiles = cont.listDirectory(objectName, '/', "", 10);
                if (directoryFiles != null && directoryFiles.size() != 0) {
                    // The zero length object is a directory
                    isDirectory = true;
            LOG.trace("Got object. isDirectory: {}  lastModified: {}", isDirectory, lastModified);
            return new FileStatus(contentLength, isDirectory, 1, blockSize, getLastModified(lastModified), path);
        // We need to check if it may be a directory with no zero byte file associated
        Collection<DirectoryOrObject> directoryFiles = cont.listDirectory(objectName, '/', "", 10);
        if (directoryFiles != null && directoryFiles.size() != 0) {
            // In this case there is no lastModified
            LOG.debug("Got object. isDirectory: {}  lastModified: {}", isDirectory, null);
            return new FileStatus(0, isDirectory, 1, blockSize, 0L, path);
        LOG.debug("Not found {}", path.toString());
        return null;

    /**
     * Transforms last modified time stamp from String to the long format
     *
     * @param strTime time in string format as returned from Swift
     * @return time in long format
     * @throws IOException if the time stamp cannot be parsed
     */
    private long getLastModified(String strTime) throws IOException {
        final SimpleDateFormat simpleDateFormat = new SimpleDateFormat(TIME_PATTERN);
        try {
            long lastModified = simpleDateFormat.parse(strTime).getTime();
            if (lastModified == 0) {
                lastModified = System.currentTimeMillis();
            return lastModified;
        } catch (ParseException e) {
            throw new IOException("Failed to parse " + strTime, e);

    public boolean exists(String hostName, Path path) throws IOException, FileNotFoundException {
        LOG.trace("Object exists: {}", path);
        StoredObject so = mAccount.getContainer(container).getObject(path.toString().substring(hostName.length()));
        return so.exists();

    public FSDataInputStream getObject(String hostName, Path path) throws IOException {
        // SwiftInputStream.printStackTrace(" #### getOject hostname = " + hostName + " path= " + path);
        LOG.debug("Get object: {}", path);
        try {
            SwiftInputStream sis = new SwiftPushdownInputStream(this, hostName, path);
            return new FSDataInputStream(sis);
        } catch (IOException e) {
        return null;

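    /**
     * {@inheritDoc}
     *
     * some examples of failed attempts (the object names are shortened here):
     * a/b/objectname-attempt_201603171328_0000_m_000000_0
     * a/b/objectname-attempt_201603171328_0000_m_000000_1
     * a/b/objectname-attempt_201603171328_0000_m_000000_2
     * a/b/objectname-attempt_201603171328_0000_m_000000_3
     * or
     * a/b/objectname
     *    .csv-attempt_201603171328_0000_m_000000_1
     * a/b/objectname
     *    .csv-attempt_201603171328_0000_m_000000_0
     * in all the cases the format is objectname-taskid where
     * taskid may vary, depending on how many tasks were re-submitted
     *
     * @param hostName prefix of the path that identifies the container
     * @param path path to list
     * @param fullListing if true, objects are listed without the Spark-specific filtering
     * @return Array of Hadoop FileStatus
     * @throws IOException if the listing fails
     */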
    public FileStatus[] list(String hostName, Path path, boolean fullListing) throws IOException {
        // Lists the objects under the given path page by page. While iterating, the
        // previous entry is held back in previousElement so that it can be compared
        // with the current one before a FileStatus is produced for it.
        // NOTE(review): this listing is truncated. Closing braces are missing
        // throughout, and no visible statement ever adds 'fs' to tmpResult.
        // Restore the body from the upstream source before compiling.
        // NOTE(review): the message below has no {} placeholder, so the path
        // argument is not rendered.
        LOG.debug("List container: raw path parent", path.toString());
        Container cObj = mAccount.getContainer(container);
        // Object name prefix to list: the path without the container name or host name.
        String obj;
        if (path.toString().startsWith(container)) {
            obj = path.toString().substring(container.length() + 1);
        } else {
            obj = path.toString().substring(hostName.length());
        LOG.debug("List container for {} container {}", obj, container);
        ArrayList<FileStatus> tmpResult = new ArrayList<FileStatus>();
        PaginationMap paginationMap = cObj.getPaginationMap(obj, pageListSize);
        FileStatus fs = null;
        StoredObject previousElement = null;
        for (Integer page = 0; page < paginationMap.getNumberOfPages(); page++) {
            Collection<StoredObject> res = cObj.list(paginationMap, page);
            // An empty first page means there is nothing to list.
            if (page == 0 && (res == null || res.isEmpty())) {
                FileStatus[] emptyRes = {};
                LOG.debug("List {} in container {} is empty", obj, container);
                return emptyRes;
            for (StoredObject tmp : res) {
                if (previousElement == null) {
                    // first entry
                    // NOTE(review): presumably followed by 'continue' - confirm against upstream.
                    setCorrectSize(tmp, cObj);
                    previousElement = tmp.getAsObject();
                String unifiedObjectName = extractUnifiedObjectName(tmp.getName());
                if (isSparkOrigin(unifiedObjectName) && !fullListing) {
                    LOG.trace("{} created by Spark", unifiedObjectName);
                    if (!isJobSuccessfull(unifiedObjectName)) {
                        LOG.trace("{} created by failed Spark job. Skipped", unifiedObjectName);
                        // Output of a failed job is deleted when automatic delete is enabled.
                        if (fModeAutomaticDelete) {
                            delete(hostName, new Path(tmp.getName()), true);
                    } else {
                        // if we here - data created by spark and job completed successfully
                        // however there be might parts of failed tasks that were not aborted
                        // we need to make sure there are no failed attempts
                        if (nameWithoutTaskID(tmp.getName()).equals(nameWithoutTaskID(previousElement.getName()))) {
                            // found failed that was not aborted.
                            // NOTE(review): the two LOG.trace calls below end on a comma;
                            // their final argument lines are missing.
                            LOG.trace("Collisiion found between {} and {}", previousElement.getName(),
                            setCorrectSize(tmp, cObj);
                            // Of two colliding attempts the larger object is kept.
                            if (previousElement.getContentLength() < tmp.getContentLength()) {
                                LOG.trace("New canditate is {}. Removed {}", tmp.getName(),
                                previousElement = tmp.getAsObject();
                fs = null;
                // Zero length objects are reported only for a full listing.
                if (previousElement.getContentLength() > 0 || fullListing) {
                    fs = getFileStatus(previousElement, cObj, hostName, path);
                previousElement = tmp.getAsObject();
        // Handle the entry that was still held back when the loop finished.
        if (previousElement != null && (previousElement.getContentLength() > 0 || fullListing)) {
            fs = getFileStatus(previousElement, cObj, hostName, path);
        LOG.debug("Listing of {} completed with {} results", path.toString(), tmpResult.size());
        return tmpResult.toArray(new FileStatus[tmpResult.size()]);

    /**
     * Merge between two paths
     *
     * @param hostName prefix of the path that identifies the container
     * @param p path
     * @param objectName name of the object
     * @return merged path
     */
    private String getMergedPath(String hostName, Path p, String objectName) {
        if ((p.getParent() != null) && (p.getName() != null) && (p.getParent().toString().equals(hostName))) {
            if (objectName.equals(p.getName())) {
                return p.toString();
            if (objectName.startsWith(p.getName())) {
                return p.getParent() + objectName;
            return p.toString();
        return hostName + objectName;

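    /**
     * Direct HTTP PUT request without JOSS package
     *
     * @param objName name of the object
     * @param contentType content type
     * @param metadata user metadata, sent as X-Object-Meta-* request headers; may be null
     * @param statistics statistics object handed to the returned stream
     * @return output stream that writes to the HTTP connection
     * @throws IOException if the connection or the stream cannot be created
     */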
    public FSDataOutputStream createObject(String objName, String contentType, Map<String, String> metadata,
            Statistics statistics) throws IOException {
        URL url = new URL(getAccessURL() + "/" + objName);
        LOG.debug("PUT {}. Content-Type : {}", url.toString(), contentType);
        try {
            HttpURLConnection httpCon = (HttpURLConnection) url.openConnection();
            httpCon.addRequestProperty("X-Auth-Token", getAuthToken());
            httpCon.addRequestProperty("Content-Type", contentType);
            if (metadata != null && !metadata.isEmpty()) {
                for (Map.Entry<String, String> entry : metadata.entrySet()) {
                    httpCon.addRequestProperty("X-Object-Meta-" + entry.getKey(), entry.getValue());
            return new FSDataOutputStream(new SwiftOutputStream(httpCon), statistics);
        } catch (IOException e) {
            throw e;

    public boolean delete(String hostName, Path path, boolean recursive) throws IOException {
        String obj = path.toString();
        if (path.toString().startsWith(hostName)) {
            obj = path.toString().substring(hostName.length());
        LOG.debug("Object name to delete {}. Path {}", obj, path.toString());
        StoredObject so = mAccount.getContainer(container).getObject(obj);
        if (so.exists()) {
        return true;

    private String getAuthToken() {
        return mAccess.getToken();

    /**
     * Get authenticated URL
     *
     * @return the public URL if usePublicURL is set, the internal URL otherwise
     */
    private String getAccessURL() {
        if (usePublicURL) {
            return mAccess.getPublicURL();
        return mAccess.getInternalURL();

    /**
     * {@inheritDoc}
     *
     * Swift driver doesn't require any local working directory
     *
     * @return path to the working directory, always null for this driver
     */
    public Path getWorkingDirectory() {
        return null;

    /**
     * Checks if container/object exists and verifies
     * that it contains Data-Origin=stocator metadata
     * If so, object was created by Spark.
     *
     * @param objectName name of the object, optionally prefixed by the container name
     * @return boolean if object was created by Spark
     */
    private boolean isSparkOrigin(String objectName) {
        if (cachedSparkOriginated.containsKey(objectName)) {
            return cachedSparkOriginated.get(objectName).booleanValue();
        String obj = objectName;
        if (objectName.toString().startsWith(container)) {
            obj = objectName.substring(container.length() + 1);
        Boolean sparkOriginated = Boolean.FALSE;
        StoredObject so = mAccount.getContainer(container).getObject(obj);
        if (so.exists()) {
            Object sparkOrigin = so.getMetadata("Data-Origin");
            if (sparkOrigin != null) {
                String tmp = (String) sparkOrigin;
                if (tmp.equals("stocator")) {
                    sparkOriginated = Boolean.TRUE;
        cachedSparkOriginated.put(objectName, sparkOriginated);
        return sparkOriginated.booleanValue();

    /**
     * Checks if container/object contains
     * container/object/_SUCCESS
     * If so, this object was created by successful Hadoop job
     *
     * @param objectName name of the object, optionally prefixed by the container name
     * @return boolean if job is successful
     */
    private boolean isJobSuccessfull(String objectName) {
        if (cachedSparkJobsStatus.containsKey(objectName)) {
            return cachedSparkJobsStatus.get(objectName).booleanValue();
        String obj = objectName;
        if (objectName.toString().startsWith(container)) {
            obj = objectName.substring(container.length() + 1);
        StoredObject so = mAccount.getContainer(container).getObject(obj + "/" + HADOOP_SUCCESS);
        Boolean isJobOK = Boolean.FALSE;
        if (so.exists()) {
            isJobOK = Boolean.TRUE;
        cachedSparkJobsStatus.put(objectName, isJobOK);
        return isJobOK.booleanValue();

    /**
     * Accepts any object name.
     * If the object name is of the form
     * a/b/c/objectname
     *    .csv-attempt_20160317132a_wrong_0000_m_000000_1
     * then the parent a/b/c is returned.
     * The code tests that attempt_20160317132a_wrong_0000_m_000000_1 is a valid
     * task id identifier
     *
     * @param objectName name of the object
     * @return unified object name
     */
    private String extractUnifiedObjectName(String objectName) {
        Path p = new Path(objectName);
        if (objectName.indexOf("-" + HADOOP_ATTEMPT) > 0) {
            String attempt = objectName.substring(objectName.lastIndexOf("-") + 1);
            try {
                return p.getParent().toString();
            } catch (IllegalArgumentException e) {
                return objectName;
        } else if (objectName.indexOf(HADOOP_SUCCESS) > 0) {
            return p.getParent().toString();
        return objectName;

    /**
     * Accepts any object name.
     * If the object name is of the form
     * a/b/c/objectname
     *    .csv-attempt_20160317132a_wrong_0000_m_000000_1
     * then the name up to, but not including, the "-attempt..." suffix is returned.
     * Performs a test that attempt_20160317132a_wrong_0000_m_000000_1 is a valid
     * task id identifier
     *
     * @param objectName name of the object
     * @return object name without the task id
     */
    private String nameWithoutTaskID(String objectName) {
        int index = objectName.indexOf("-" + HADOOP_ATTEMPT);
        if (index > 0) {
            String attempt = objectName.substring(objectName.lastIndexOf("-") + 1);
            try {
                return objectName.substring(0, index);
            } catch (IllegalArgumentException e) {
                return objectName;
        return objectName;

    /**
     * Swift has a bug where container listing might wrongly report size 0
     * for large objects. It seems to be a well known issue in Swift without
     * a solution.
     * We have to provide a workaround for this.
     * If container listing reports size 0 for some object, we send an
     * additional HEAD on that object to verify its size.
     *
     * @param tmp JOSS StoredObject
     * @param cObj JOSS Container object
     */
    private void setCorrectSize(StoredObject tmp, Container cObj) {
        long objectSize = tmp.getContentLength();
        if (objectSize == 0) {
            // we may hit a well known Swift bug.
            // container listing reports 0 for large objects.
            StoredObject soDirect = cObj.getObject(tmp.getName());
            if (soDirect.getContentLength() > 0) {

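    /**
     * Maps StoredObject of JOSS into Hadoop FileStatus
     *
     * @param tmp JOSS StoredObject to map
     * @param cObj JOSS Container object
     * @param hostName prefix of the path that identifies the container
     * @param path path that was listed
     * @return FileStatus representing current object
     * @throws IllegalArgumentException if the merged path is not a valid Path
     * @throws IOException if the last-modified time stamp cannot be parsed
     */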
    private FileStatus getFileStatus(StoredObject tmp, Container cObj, String hostName, Path path)
            throws IllegalArgumentException, IOException {
        String newMergedPath = getMergedPath(hostName, path, tmp.getName());
        return new FileStatus(tmp.getContentLength(), false, 1, blockSize, getLastModified(tmp.getLastModified()),
                0, null, null, null, new Path(newMergedPath));