org.apache.streams.hdfs.WebHdfsPersistWriter.java Source code

Introduction

Here is the source code for org.apache.streams.hdfs.WebHdfsPersistWriter.java
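
WebHdfsPersistWriter is an Apache Streams persist writer that serializes each StreamsDatum to a tab-delimited line (id, timestamp, metadata, document JSON) and writes it to HDFS over the WebHDFS protocol, rolling to a new timestamped .tsv file after a configurable number of lines.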

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.streams.hdfs;

import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Strings;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.web.WebHdfsFileSystem;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.streams.core.DatumStatus;
import org.apache.streams.core.DatumStatusCountable;
import org.apache.streams.core.DatumStatusCounter;
import org.apache.streams.core.StreamsDatum;
import org.apache.streams.core.StreamsPersistWriter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.Closeable;
import java.io.Flushable;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.net.URI;
import java.net.URISyntaxException;
import java.security.PrivilegedExceptionAction;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Queue;

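/**
 * A StreamsPersistWriter that serializes each StreamsDatum to a tab-delimited
 * line (id, timestamp, metadata JSON, document JSON) and writes it to HDFS over
 * the WebHDFS protocol. Output is rolled to a new timestamped .tsv file after
 * {@code linesPerFile} lines, and the current file is flushed once roughly
 * 64 MB of unflushed data has accumulated.
 */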
public class WebHdfsPersistWriter implements StreamsPersistWriter, Flushable, Closeable, DatumStatusCountable {
    public static final String STREAMS_ID = "WebHdfsPersistWriter";

    private static final Logger LOGGER = LoggerFactory.getLogger(WebHdfsPersistWriter.class);

    private static final char DELIMITER = '\t';
    private static final int DEFAULT_LINES_PER_FILE = 50000;

    private FileSystem client;
    private Path path;
    private String filePart = "default";
    private int linesPerFile;
    private int totalRecordsWritten = 0;
    private final List<Path> writtenFiles = new ArrayList<Path>();
    private int fileLineCounter = 0;
    private OutputStreamWriter currentWriter = null;

    private static final int BYTES_IN_MB = 1024 * 1024;
    private static final int BYTES_BEFORE_FLUSH = 64 * BYTES_IN_MB;
    private volatile int totalByteCount = 0;
    private volatile int byteCount = 0;

    public boolean terminate = false;

    protected volatile Queue<StreamsDatum> persistQueue;

    private ObjectMapper mapper = new ObjectMapper();

    private HdfsWriterConfiguration hdfsConfiguration;

    public WebHdfsPersistWriter(HdfsWriterConfiguration hdfsConfiguration) {
        this.hdfsConfiguration = hdfsConfiguration;
        this.linesPerFile = hdfsConfiguration.getLinesPerFile().intValue();
    }

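    /**
     * Builds the filesystem URI from the configured scheme, host and port
     * (e.g. webhdfs://host:port); when no host is configured the result is
     * simply {@code scheme:///}.
     */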
    public URI getURI() throws URISyntaxException {
        StringBuilder uriBuilder = new StringBuilder();
        uriBuilder.append(hdfsConfiguration.getScheme());
        uriBuilder.append("://");
        if (!Strings.isNullOrEmpty(hdfsConfiguration.getHost()))
            uriBuilder.append(hdfsConfiguration.getHost() + ":" + hdfsConfiguration.getPort());
        else
            uriBuilder.append("/");
        return new URI(uriBuilder.toString());
    }

    public boolean isConnected() {
        return (client != null);
    }

    public final synchronized FileSystem getFileSystem() {
        // Check to see if we are connected.
        if (!isConnected())
            connectToWebHDFS();
        return this.client;
    }

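    /**
     * Creates the WebHDFS FileSystem client, executing the connection as the
     * configured remote user via UserGroupInformation#doAs.
     */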
    private synchronized void connectToWebHDFS() {
        try {
            LOGGER.info("User : {}", this.hdfsConfiguration.getUser());
            UserGroupInformation ugi = UserGroupInformation.createRemoteUser(this.hdfsConfiguration.getUser());
            ugi.setAuthenticationMethod(UserGroupInformation.AuthenticationMethod.SIMPLE);

            ugi.doAs(new PrivilegedExceptionAction<Void>() {
                public Void run() throws Exception {
                    Configuration conf = new Configuration();
                    conf.set(CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTHENTICATION, "kerberos");
                    LOGGER.info("WebURI : {}", getURI().toString());
                    client = FileSystem.get(getURI(), conf);
                    LOGGER.info("Connected to WebHDFS");

                    /*
                     * ************************************************************************************************
                     * This code is an example of how you would work with HDFS if you weren't going over
                     * the webHDFS protocol.
                     *
                     * Smashew: 2013-10-01
                     * ************************************************************************************************
                    conf.set("fs.defaultFS", "hdfs://hadoop.mdigitallife.com:8020/user/" + userName);
                    conf.set("namenode.host","0.0.0.0");
                    conf.set("hadoop.job.ugi", userName);
                    conf.set(DFSConfigKeys.DFS_NAMENODE_USER_NAME_KEY, "runner");
                    fileSystem.createNewFile(new Path("/user/"+ userName + "/test"));
                    FileStatus[] status = fs.listStatus(new Path("/user/" + userName));
                    for(int i=0;i<status.length;i++)
                    {
                    LOGGER.info("Directory: {}", status[i].getPath());
                    }
                    */
                    return null;
                }
            });
        } catch (Exception e) {
            LOGGER.error("There was an error connecting to WebHDFS, please check your settings and try again", e);
            throw new RuntimeException(e);
        }
    }

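    /**
     * Serializes the datum to a tab-delimited line and appends it to the
     * current file, rolling to a new file once {@code linesPerFile} lines have
     * been written and flushing after roughly 64 MB of unflushed output.
     */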
    @Override
    public void write(StreamsDatum streamsDatum) {

        synchronized (this) {
            // Check to see if we need to reset the file that we are currently working with
            if (this.currentWriter == null || (this.fileLineCounter > this.linesPerFile))
                resetFile();

            String line = convertResultToString(streamsDatum);
            // Skip datums whose document could not be serialized; writing a null
            // line would otherwise throw a NullPointerException.
            if (Strings.isNullOrEmpty(line))
                return;
            writeInternal(line);
            int bytesInLine = line.getBytes().length;

            totalRecordsWritten++;
            totalByteCount += bytesInLine;
            byteCount += bytesInLine;

            if (byteCount > BYTES_BEFORE_FLUSH)
                try {
                    flush();
                } catch (IOException e) {
                    LOGGER.warn(
                            "Error flushing to HDFS. Creating a new file and continuing execution.  WARNING: There could be data loss.",
                            e);
                }

            this.fileLineCounter++;
        }
    }

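    /**
     * Writes a single line to the current file, retrying first against a fresh
     * file and then against a fresh WebHDFS connection before giving up.
     */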
    private void writeInternal(String line) {
        try {
            this.currentWriter.write(line);
        } catch (IOException e) {
            LOGGER.warn("Error writing to HDFS.  Attempting to try a new file", e);
            try {
                resetFile();
                this.currentWriter.write(line);
            } catch (Exception io) {
                LOGGER.warn("Failed to write even after creating a new file.  Attempting to reconnect", io);
                try {
                    connectToWebHDFS();
                    resetFile();
                    this.currentWriter.write(line);
                } catch (Exception ex) {
                    LOGGER.error("Failed to write to HDFS after reconnecting client. Terminating writer.", ex);
                    throw new RuntimeException(e);
                }
            }

        }
    }

    @Override
    public void flush() throws IOException {
        if (this.currentWriter != null && byteCount > BYTES_BEFORE_FLUSH) {
            this.currentWriter.flush();
            byteCount = 0;
        }
    }

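    /**
     * Rolls the output: flushes and closes the current writer (if any), then
     * creates a new timestamped .tsv file under the configured path, failing
     * if a file of the same name already exists.
     */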
    private synchronized void resetFile() {
        // If the current file was just created and nothing has been written to it
        // yet, skip the roll so concurrent callers do not create extra files.
        if (this.fileLineCounter == 0 && this.currentWriter != null)
            return;

        // Create the path for where the file is going to live.
        Path filePath = this.path
                .suffix("/" + hdfsConfiguration.getWriterFilePrefix() + "-" + new Date().getTime() + ".tsv");

        try {

            // if there is a current writer, we must close it first.
            if (this.currentWriter != null) {
                flush();
                close();
            }

            this.fileLineCounter = 0;

            // Check to see if a file of the same name exists; if it does, we cannot proceed.
            if (client.exists(filePath))
                throw new RuntimeException("File already exists, cannot create: " + filePath);

            this.currentWriter = new OutputStreamWriter(client.create(filePath));

            // Add another file to the list of written files.
            writtenFiles.add(filePath);

            LOGGER.info("File Created: {}", filePath);
        } catch (Exception e) {
            LOGGER.error("COULD NOT CreateFile: {}", filePath);
            LOGGER.error(e.getMessage());
            throw new RuntimeException(e);
        }
    }

    @Override
    public synchronized void close() throws IOException {
        if (this.currentWriter != null) {
            this.currentWriter.flush();
            this.currentWriter.close();
            this.currentWriter = null;
            LOGGER.info("File Closed");
        }
    }

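    /**
     * Renders a datum as a single tab-delimited line: id, timestamp, metadata
     * JSON and document JSON, terminated by a newline. Returns {@code null}
     * when the document cannot be serialized to a non-empty JSON string.
     */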
    private String convertResultToString(StreamsDatum entry) {
        String metadata = null;
        try {
            metadata = mapper.writeValueAsString(entry.getMetadata());
        } catch (JsonProcessingException e) {
            LOGGER.warn("Error converting metadata to a string", e);
        }

        String documentJson = null;
        try {
            documentJson = mapper.writeValueAsString(entry.getDocument());
        } catch (JsonProcessingException e) {
            LOGGER.warn("Error converting document to string", e);
        }

        if (Strings.isNullOrEmpty(documentJson))
            return null;
        else
            return new StringBuilder().append(entry.getId()).append(DELIMITER).append(entry.getTimestamp())
                    .append(DELIMITER).append(metadata).append(DELIMITER).append(documentJson).append("\n")
                    .toString();
    }

    @Override
    public void prepare(Object configurationObject) {
        connectToWebHDFS();
        path = new Path(hdfsConfiguration.getPath() + "/" + hdfsConfiguration.getWriterPath());
    }

    @Override
    public void cleanUp() {
        try {
            flush();
        } catch (IOException e) {
            LOGGER.error("Error flushing on cleanup", e);
        }
        try {
            close();
        } catch (IOException e) {
            LOGGER.error("Error closing on cleanup", e);
        }
    }

    @Override
    public DatumStatusCounter getDatumStatusCounter() {
        DatumStatusCounter counters = new DatumStatusCounter();
        counters.incrementAttempt(this.totalRecordsWritten);
        counters.incrementStatus(DatumStatus.SUCCESS, this.totalRecordsWritten);
        return counters;
    }
}
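
For context, below is a minimal usage sketch (not part of the original file). It assumes that the generated HdfsWriterConfiguration bean exposes setters mirroring the getters used above (setHost, setPort, setUser, setPath, setWriterPath, setWriterFilePrefix, setLinesPerFile) and that StreamsDatum offers a (document, id) constructor; verify both against your Apache Streams version.

// Hypothetical usage sketch -- not part of WebHdfsPersistWriter.java.
// Setter names are assumed to mirror the getters used in the writer source;
// verify them against the generated HdfsWriterConfiguration class.
import org.apache.streams.core.StreamsDatum;
import org.apache.streams.hdfs.HdfsWriterConfiguration;
import org.apache.streams.hdfs.WebHdfsPersistWriter;

public class WebHdfsPersistWriterExample {
    public static void main(String[] args) {
        HdfsWriterConfiguration config = new HdfsWriterConfiguration();
        config.setHost("namenode.example.com");    // assumed setter
        config.setPort(50070L);                    // assumed setter; typical WebHDFS port
        config.setUser("hadoop");                  // assumed setter
        config.setPath("/user/hadoop/streams");    // assumed setter
        config.setWriterPath("example-stream");    // assumed setter
        config.setWriterFilePrefix("datums");      // assumed setter
        config.setLinesPerFile(50000L);            // assumed setter
        // NOTE: the scheme (e.g. webhdfs) must also be set; the enum type used
        // for it is version-specific and therefore omitted here.

        WebHdfsPersistWriter writer = new WebHdfsPersistWriter(config);
        writer.prepare(null);   // connects to WebHDFS and resolves the base path

        // Each datum becomes one tab-delimited line: id, timestamp, metadata, document.
        writer.write(new StreamsDatum("{\"message\":\"hello\"}", "datum-1"));

        writer.cleanUp();       // flushes and closes the current file
    }
}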