org.opencloudengine.flamingo.mapreduce.util.HdfsUtils.java — source code listing

Java tutorial

Introduction

Below is the source code for org.opencloudengine.flamingo.mapreduce.util.HdfsUtils.java.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.opencloudengine.flamingo.mapreduce.util;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hdfs.DFSClient;
import org.apache.hadoop.hdfs.protocol.HdfsFileStatus;
import org.apache.hadoop.mapreduce.InputSplit;
import org.opencloudengine.flamingo.mapreduce.util.filter.BypassPathFilter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.net.InetSocketAddress;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * Static helper methods for working with HDFS: creating
 * {@link org.apache.hadoop.hdfs.DFSClient} instances, listing, moving,
 * deleting and merging files, and simple stream utilities.
 *
 * @author Edward KIM
 * @author Seo Ji Hye
 * @since 0.1
 */
public class HdfsUtils {

    /**
     * SLF4J logger for this utility class.
     */
    private static Logger logger = LoggerFactory.getLogger(HdfsUtils.class);

    // Default "user,group" UGI value set as "hadoop.job.ugi" when building a FileSystem.
    public static final String DEFAULT_UGI = "hadoop,hadoop";

    // URL scheme prefix expected by createDFSClient(String).
    public static final String HDFS_URL_PREFIX = "hdfs://";

    /**
     * Creates a {@link DFSClient} for the given HDFS URL.
     *
     * @param hdfsUrl HDFS URL of the form {@code hdfs://host:port}
     * @return DFS Client
     * @throws IllegalArgumentException if the URL is null, lacks the
     *                                  {@code hdfs://} prefix, or has no port
     * @throws java.io.IOException      if the DFS client cannot be created
     */
    public static DFSClient createDFSClient(String hdfsUrl) throws IOException {
        if (hdfsUrl == null || !hdfsUrl.startsWith(HDFS_URL_PREFIX)) {
            throw new IllegalArgumentException("Invalid HDFS URL [" + hdfsUrl + "]");
        }
        // Strip the scheme, then split into host and port. The original code did not
        // verify that a port was present and failed with ArrayIndexOutOfBoundsException.
        String[] parts = hdfsUrl.substring(HDFS_URL_PREFIX.length()).split(":");
        if (parts.length != 2) {
            throw new IllegalArgumentException(
                    "HDFS URL must be of the form hdfs://host:port but was [" + hdfsUrl + "]");
        }
        return createDFSClient(parts[0], Integer.parseInt(parts[1]));
    }

    /**
     * Creates a {@link DFSClient} connected to the given Namenode.
     *
     * @param namenodeIp   Namenode IP address
     * @param namenodePort Namenode port
     * @return DFS Client
     * @throws java.io.IOException if the DFS client cannot be created
     */
    public static DFSClient createDFSClient(String namenodeIp, int namenodePort) throws IOException {
        InetSocketAddress namenodeAddress = new InetSocketAddress(namenodeIp, namenodePort);
        return new DFSClient(namenodeAddress, new Configuration());
    }

    /**
     * Deletes the given path when it exists.
     *
     * @param client    DFS Client
     * @param path      path to delete
     * @param recursive whether to delete directories recursively
     * @return <tt>true</tt> when the path existed and was deleted
     * @throws java.io.IOException on HDFS I/O failure
     */
    public static boolean remove(DFSClient client, String path, boolean recursive) throws IOException {
        if (!client.exists(path)) {
            logger.info(" [{}] ??  .", path);
            return false;
        }
        logger.info(" [{}] ??  . Recursive  [{}]", path,
                recursive);
        return client.delete(path, recursive);
    }

    /**
     * Lists the regular files (not directories) directly under the given path.
     *
     * @param fs   FileSystem
     * @param path directory path
     * @return list of "path/filename" strings; empty when there are no files
     * @throws java.io.IOException on HDFS I/O failure
     */
    public static List<String> listFiles(FileSystem fs, String path) throws IOException {
        List<String> filenames = new ArrayList<String>();
        FileStatus[] entries = fs.listStatus(new Path(path));
        if (entries == null) {
            return filenames;
        }
        for (FileStatus entry : entries) {
            if (entry.isDir()) {
                continue;
            }
            filenames.add(path + "/" + entry.getPath().getName());
        }
        return filenames;
    }

    /**
     * Opens an output stream on HDFS via the given DFS Client.
     *
     * @param client    DFS Client
     * @param filename  file to create
     * @param overwrite whether an existing file may be overwritten
     * @return output stream for the new file
     * @throws java.io.IOException on HDFS I/O failure
     */
    public static OutputStream getOutputStream(DFSClient client, String filename, boolean overwrite)
            throws IOException {
        OutputStream stream = client.create(filename, overwrite);
        return stream;
    }

    /**
     * Opens an input stream on an HDFS file via the given DFS Client.
     *
     * @param client   DFS Client
     * @param filename file to read
     * @return input stream for the file
     * @throws java.io.IOException on HDFS I/O failure
     */
    public static InputStream getInputStream(DFSClient client, String filename) throws IOException {
        InputStream stream = client.open(filename);
        return stream;
    }

    /**
     * Closes the given output stream. A {@code null} argument is ignored so
     * callers can safely close a stream that may not have been opened.
     * (The method name keeps its historical typo "Ouput" for caller compatibility.)
     *
     * @param outputStream stream to close; may be {@code null}
     * @throws java.io.IOException if closing fails
     */
    public static void closeOuputStream(OutputStream outputStream) throws IOException {
        if (outputStream != null) {
            outputStream.close();
        }
    }

    /**
     * Closes the given input stream. A {@code null} argument is ignored so
     * callers can safely close a stream that may not have been opened.
     *
     * @param inputStream stream to close; may be {@code null}
     * @throws java.io.IOException if closing fails
     */
    public static void closeInputStream(InputStream inputStream) throws IOException {
        if (inputStream != null) {
            inputStream.close();
        }
    }

    /**
     * Extracts the file name portion from an {@link InputSplit}, whose string
     * form is <tt>file + ":" + start + "+" + length</tt>.
     *
     * @param inputSplit Input Split
     * @return the file name up to (but excluding) the first ':'
     */
    public static String getFilename(InputSplit inputSplit) {
        String splitName = org.opencloudengine.flamingo.mapreduce.util.FileUtils.getFilename(inputSplit.toString());
        int separator = splitName.indexOf(":");
        return splitName.substring(0, separator);
    }

    /**
     * Builds a {@link FileSystem} for the given HDFS URL.
     *
     * @param hdfsUrl HDFS URL
     * @return FileSystem bound to that URL
     * @throws java.io.IOException if the FileSystem cannot be obtained
     */
    public static FileSystem getFileSystem(String hdfsUrl) throws IOException {
        Configuration conf = new Configuration();
        // NOTE(review): "fs.default.name" is the pre-Hadoop-2 key; newer Hadoop prefers "fs.defaultFS".
        conf.set("fs.default.name", hdfsUrl);
        return FileSystem.get(conf);
    }

    /**
     * Checks whether the given path exists.
     *
     * @param client DFS Client
     * @param path   path to check
     * @return <tt>true</tt> when the path exists
     * @throws java.io.IOException on HDFS I/O failure
     */
    public static boolean exists(DFSClient client, String path) throws IOException {
        boolean found = client.exists(path);
        return found;
    }

    /**
     * Checks whether the given path refers to a regular file.
     * Returns {@code false} when the path does not exist — the original code
     * dereferenced the possibly-null status from {@code getFileInfo} and could
     * throw a {@link NullPointerException} for a missing path.
     *
     * @param client DFS Client
     * @param path   path to check
     * @return <tt>true</tt> when the path exists and is a regular file
     * @throws java.io.IOException on HDFS I/O failure
     */
    public static boolean isFile(DFSClient client, String path) throws IOException {
        HdfsFileStatus status = client.getFileInfo(path);
        if (status == null) {
            return false;
        }
        return !status.isDir();
    }

    /**
     * Checks whether the given path is an existing directory.
     *
     * @param fs   {@link org.apache.hadoop.fs.FileSystem}
     * @param path path to check
     * @return <tt>true</tt> when the path exists and is a directory
     * @throws java.io.IOException on HDFS I/O failure
     */
    public static boolean isDirectory(FileSystem fs, String path) throws IOException {
        try {
            return fs.getFileStatus(new Path(path)).isDir();
        } catch (FileNotFoundException ex) {
            // A missing path is simply "not a directory".
            return false;
        }
    }

    /**
     * Writes the given string content to a file on HDFS, overwriting any
     * existing file. The output stream is now closed even when the copy
     * throws (the original leaked it on exception).
     *
     * @param client  DFS Client
     * @param path    destination file path
     * @param content content to write
     * @throws java.io.IOException on HDFS I/O failure
     */
    public static void saveFile(DFSClient client, String path, String content) throws IOException {
        OutputStream outputStream = getOutputStream(client, path, true);
        try {
            // NOTE(review): content.getBytes() uses the platform default charset — confirm UTF-8 is intended.
            org.opencloudengine.flamingo.mapreduce.util.FileUtils.copy(content.getBytes(), outputStream);
        } finally {
            outputStream.close();
        }
    }

    /**
     * Fetches the file status for the given path.
     *
     * @param client DFS Client
     * @param path   path to inspect
     * @return file status as reported by the Namenode
     * @throws java.io.IOException on HDFS I/O failure
     */
    public static HdfsFileStatus getFileInfo(DFSClient client, String path) throws IOException {
        HdfsFileStatus status = client.getFileInfo(path);
        return status;
    }

    /**
     * Uploads a local file to {@code hdfsPath/filename} on the HDFS at
     * {@code hdfsUrl}. The DFS client is now closed in a finally block so it
     * is released even when the copy throws (the original leaked it).
     *
     * @param hdfsUrl          HDFS URL
     * @param filename         target file name under {@code hdfsPath}
     * @param hdfsPath         target HDFS directory
     * @param downloadFilePath local source file path
     * @throws java.io.IOException on HDFS I/O failure
     */
    public static void uploadToHdfs(String hdfsUrl, String filename, String hdfsPath, String downloadFilePath)
            throws IOException {
        String hdfsFullPath = hdfsPath + "/" + filename;
        File inputFile = new File(downloadFilePath);
        DFSClient dfsClient = HdfsUtils.createDFSClient(hdfsUrl);
        try {
            copyFromLocalFileToHdfsFile(inputFile, dfsClient, hdfsFullPath);
        } finally {
            dfsClient.close();
        }
    }

    /**
     * Copies a local file to a file on HDFS. Both streams are now closed in
     * finally blocks — the original never closed either stream, leaking a
     * local file handle and an HDFS output stream on every call.
     *
     * @param inputFile local source file
     * @param client    DFSClient
     * @param hdfsPath  destination path on HDFS
     * @throws java.io.IOException on copy failure
     */
    public static void copyFromLocalFileToHdfsFile(File inputFile, DFSClient client, String hdfsPath)
            throws IOException {
        OutputStream outputStream = HdfsUtils.getOutputStream(client, hdfsPath, true);
        try {
            InputStream inputStream = new FileInputStream(inputFile);
            try {
                org.opencloudengine.flamingo.mapreduce.util.FileUtils.copy(inputStream, outputStream);
            } finally {
                inputStream.close();
            }
        } finally {
            outputStream.close();
        }
    }

    /**
     * Moves every entry under {@code path} into {@code targetDirectory},
     * renaming each to {@code prefixToAppend + "_" + originalName}.
     * Throws when any target name already exists.
     *
     * @param conf            Hadoop Configuration
     * @param path            source directory
     * @param prefixToAppend  prefix prepended to each moved file name
     * @param targetDirectory destination directory
     * @throws java.io.IOException on HDFS I/O failure
     */
    public static void moveFileToDirectory(Configuration conf, String path, String prefixToAppend,
            String targetDirectory) throws IOException {
        FileSystem fileSystem = FileSystem.get(conf);
        for (FileStatus fileStatus : fileSystem.listStatus(new Path(path))) {
            String filename = prefixToAppend + "_" + fileStatus.getPath().getName();
            String target = targetDirectory + "/" + filename;
            if (isExist(conf, target)) {
                throw new RuntimeException(
                        "\t  Warn: '" + fileStatus.getPath() + "' cannot moved. Already exists.");
            }
            fileSystem.rename(fileStatus.getPath(), new Path(target));
        }
    }

    /**
     * Moves each "delay" file into {@code targetDirectory} under a name
     * derived from its prefix. {@code FileSystem.get(conf)} is loop-invariant
     * and is now fetched once instead of on every iteration.
     *
     * @param conf            Hadoop Configuration
     * @param delayFiles      source file paths
     * @param targetDirectory destination directory
     * @throws java.io.IOException on HDFS I/O failure
     */
    public static void moveFilesToDirectory(Configuration conf, List<String> delayFiles, String targetDirectory)
            throws IOException {
        FileSystem fileSystem = FileSystem.get(conf);
        for (String path : delayFiles) {
            String filename = FileUtils.getFilename(path);
            String delayedFilePrefix = filename.split("-")[0];
            String outputHead = delayedFilePrefix.replaceAll("delay", "");
            // NOTE(review): assumes the prefix has at least 5 characters — confirm with real file names.
            String outputMiddle = delayedFilePrefix.substring(0, 5); // todo
            String outputTail = filename.replaceAll(delayedFilePrefix, "");

            System.out.println(
                    "Acceleration Dir " + targetDirectory + "/" + outputHead + "_" + outputMiddle + outputTail);
            makeDirectoryIfNotExists(targetDirectory, conf);

            fileSystem.rename(new Path(path),
                    new Path(targetDirectory + "/" + outputHead + "_" + outputMiddle + outputTail));

            System.out.println("\t Moved: '" + path + "' --> '" + targetDirectory + "'");
        }
    }

    /**
     * Moves each listed path into {@code targetDirectory} (best-effort: a
     * failure on one path does not stop the others). Failures are now logged
     * with the full exception — the original printed only
     * {@code ex.getMessage()} to stderr, discarding the stack trace.
     *
     * @param conf            Hadoop Configuration
     * @param paths           source paths
     * @param prefixToAppend  prefix prepended to each moved file name
     * @param targetDirectory destination directory
     * @throws java.io.IOException on HDFS I/O failure
     */
    public static void moveFilesToDirectory(Configuration conf, List<String> paths, String prefixToAppend,
            String targetDirectory) throws IOException {
        for (String file : paths) {
            try {
                HdfsUtils.moveFileToDirectory(conf, file, prefixToAppend, targetDirectory);
                System.out.println("\t Moved: '" + file + "' --> '" + targetDirectory + "'");
            } catch (Exception ex) {
                // Keep the best-effort semantics but preserve the cause for diagnosis.
                logger.warn("Cannot move '" + file + "' to '" + targetDirectory + "'.", ex);
            }
        }
    }

    /**
     * Creates the given directory on the HDFS at {@code hdfsUrl} when it does
     * not already exist.
     *
     * @param directory directory to create
     * @param hdfsUrl   HDFS URL
     * @throws java.io.IOException on HDFS I/O failure
     */
    public static void makeDirectoryIfNotExists(String directory, String hdfsUrl) throws IOException {
        Configuration conf = new Configuration();
        conf.set("fs.default.name", hdfsUrl);
        conf.set("hadoop.job.ugi", DEFAULT_UGI);
        FileSystem fileSystem = FileSystem.get(conf);
        if (isDirectory(fileSystem, directory)) {
            return;
        }
        logger.info("HDFS? [{}]    ?.", directory);
        fileSystem.mkdirs(new Path(directory));
    }

    /**
     * Collects all regular files under each of the given HDFS directories.
     *
     * @param hdfsUrl         HDFS URL
     * @param hdfsDirectories directories to scan
     * @return array of file paths from all directories
     * @throws java.io.IOException on HDFS I/O failure
     */
    public static String[] getHdfsFiles(String hdfsUrl, List<String> hdfsDirectories) throws IOException {
        FileSystem fs = HdfsUtils.getFileSystem(hdfsUrl);
        List<String> collected = new ArrayList<String>();
        for (String directory : hdfsDirectories) {
            collected.addAll(HdfsUtils.listFiles(fs, directory));
        }
        return StringUtils.toStringArray(collected);
    }

    /**
     * Lists the files directly under {@code path} whose names end with
     * {@code ext}, creating the directory first when it is missing.
     * The original implementation never populated its result list and ignored
     * {@code ext}, so it always returned an empty array; it also created and
     * closed a DFS client it never used.
     *
     * @param hdfsUrl HDFS URL
     * @param ext     file-name suffix to match (e.g. <tt>.dat</tt>); {@code null} matches all
     * @param path    directory to scan
     * @return matching file paths
     * @throws java.io.IOException on HDFS I/O failure
     */
    public static String[] getHdfsFiles(String hdfsUrl, String ext, String path) throws IOException {
        makeDirectoryIfNotExists(path, hdfsUrl);
        ArrayList<String> files = new ArrayList<String>();
        FileSystem fs = HdfsUtils.getFileSystem(hdfsUrl);
        for (String file : listFiles(fs, path)) {
            if (ext == null || file.endsWith(ext)) {
                files.add(file);
            }
        }
        return StringUtils.toStringArray(files);
    }

    /**
     * Checks whether {@code path} exists as a regular file on the HDFS at
     * {@code hdfsUrl}. NOTE: returns {@code false} for directories — this
     * matches the original behavior, which callers may depend on.
     * The client is now closed in a finally block so it is released even when
     * {@code getFileInfo} throws (the original leaked it on exception).
     *
     * @param hdfsUrl HDFS URL
     * @param path    path to check
     * @return <tt>true</tt> when the path exists and is a regular file
     * @throws java.io.IOException on HDFS I/O failure
     */
    public static boolean isExist(String hdfsUrl, String path) throws IOException {
        DFSClient client = HdfsUtils.createDFSClient(hdfsUrl);
        try {
            HdfsFileStatus status = client.getFileInfo(path);
            return status != null && !status.isDir();
        } finally {
            client.close();
        }
    }

    /**
     * Creates the given directory when it exists neither as a file nor as a
     * directory.
     *
     * @param directory directory to create
     * @param conf      Hadoop Configuration
     * @throws java.io.IOException on HDFS I/O failure
     */
    public static void makeDirectoryIfNotExists(String directory, Configuration conf) throws IOException {
        FileSystem fileSystem = FileSystem.get(conf);
        boolean present = isExist(conf, directory) || isDirectory(fileSystem, directory);
        if (!present) {
            fileSystem.mkdirs(new Path(directory));
        }
    }

    /**
     * Checks whether the given path exists on the default FileSystem.
     * The original called {@code getFileStatus}, which throws
     * {@link FileNotFoundException} for a missing path instead of returning
     * {@code null}, so its {@code status != null} check could never yield
     * {@code false}; {@code exists()} gives the intended semantics.
     *
     * @param path path to check
     * @return <tt>true</tt> when the path exists
     * @throws java.io.IOException on HDFS I/O failure
     */
    public static boolean isExist(String path) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        return fs.exists(new Path(path));
    }

    /**
     * Checks whether the given path exists on the FileSystem described by
     * {@code conf}.
     *
     * @param conf Hadoop Job Configuration
     * @param path path to check
     * @return <tt>true</tt> when the path exists
     * @throws java.io.IOException on HDFS I/O failure
     */
    public static boolean isExist(Configuration conf, String path) throws IOException {
        Path target = new Path(path);
        return FileSystem.get(conf).exists(target);
    }

    /**
     * Recursively deletes everything matching the given glob pattern on the
     * HDFS at {@code hdfsUrl}. {@code globStatus} returns {@code null} when
     * the pattern matches nothing — the original dereferenced it unguarded
     * and threw a {@link NullPointerException}.
     *
     * @param hdfsUrl       HDFS URL
     * @param hdfsDirectory glob pattern of paths to delete
     * @throws java.io.IOException on delete failure
     */
    public static void deleteFromHdfs(String hdfsUrl, String hdfsDirectory) throws IOException {
        Configuration conf = new Configuration();
        conf.set("fs.default.name", hdfsUrl);
        FileSystem fs = FileSystem.get(conf);
        FileStatus[] statuses = fs.globStatus(new Path(hdfsDirectory));
        if (statuses == null) {
            return;
        }
        for (FileStatus fileStatus : statuses) {
            fs.delete(fileStatus.getPath(), true);
        }
    }

    /**
     * Recursively deletes everything matching the given glob pattern on the
     * default FileSystem. {@code globStatus} returns {@code null} when the
     * pattern matches nothing — the original dereferenced it unguarded and
     * threw a {@link NullPointerException}.
     *
     * @param hdfsDirectory glob pattern of paths to delete
     * @throws java.io.IOException on delete failure
     */
    public static void deleteFromHdfs(String hdfsDirectory) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        FileStatus[] statuses = fs.globStatus(new Path(hdfsDirectory));
        if (statuses == null) {
            return;
        }
        for (FileStatus fileStatus : statuses) {
            fs.delete(fileStatus.getPath(), true);
        }
    }

    /**
     * Merges all files under the given directory into a single file at the
     * same path. A non-directory path is left untouched.
     *
     * Two redundant operations from the original are removed:
     * {@code copyMerge(..., deleteSource=true, ...)} already removes the
     * source directory, so the explicit {@code delete(source)} afterwards was
     * a no-op; and the trailing delete of {@code path + "_temporary"} after
     * the rename was a no-op on success but would have destroyed the merged
     * data if the rename had failed.
     *
     * @param hdfsUrl HDFS URL
     * @param path    HDFS directory to merge
     * @throws java.io.IOException on merge failure
     */
    public static void merge(String hdfsUrl, String path) throws IOException {
        Configuration conf = new Configuration();
        conf.set("fs.default.name", hdfsUrl);
        FileSystem fileSystem = FileSystem.get(conf);
        Path source = new Path(path);
        if (!fileSystem.getFileStatus(source).isDir()) {
            // Already a single file: nothing to merge.
            return;
        }
        Path temporary = new Path(path + "_temporary");
        // Merge the directory contents into the temporary file; deleteSource=true
        // removes the source directory once the merge succeeds.
        FileUtil.copyMerge(fileSystem, source, fileSystem, temporary, true, conf, null);
        // Move the merged file back to the original path.
        fileSystem.rename(temporary, source);
    }

    /**
     * Returns the first file (per {@link SortableFileStatus} ordering) among
     * the regular files under the given path that pass the filter, or
     * {@code null} when the directory contains no matching files — the
     * original called {@code files.get(0)} unguarded and threw
     * {@link IndexOutOfBoundsException} for an empty directory.
     *
     * @param conf       Hadoop Configuration
     * @param path       directory to scan
     * @param pathFilter optional filter; {@code null} accepts everything
     * @return path of the selected file, or {@code null} when none match
     * @throws java.io.IOException on HDFS I/O failure
     */
    public static String getLatestFile(Configuration conf, String path, PathFilter pathFilter) throws IOException {
        List<SortableFileStatus> files = new ArrayList<SortableFileStatus>();
        FileSystem fs = FileSystem.get(conf);
        FileStatus[] statuses = fs.listStatus(new Path(path),
                pathFilter != null ? pathFilter : new BypassPathFilter());
        if (statuses != null) {
            for (FileStatus fileStatus : statuses) {
                if (!fileStatus.isDir()) {
                    files.add(new SortableFileStatus(fileStatus));
                }
            }
        }
        if (files.isEmpty()) {
            return null;
        }
        Collections.sort(files);
        return files.get(0).fileStatus.getPath().toUri().getPath();
    }

    /**
     * Recursively deletes the given path.
     *
     * @param configuration Hadoop Configuration
     * @param path          path to delete
     * @throws java.io.IOException on delete failure
     */
    public static void delete(Configuration configuration, String path) throws IOException {
        FileSystem fileSystem = FileSystem.get(configuration);
        fileSystem.delete(new Path(path), true);
    }

    /**
     * Lists the regular files under the given path whose names start with the
     * given prefix and pass the filter.
     *
     * @param conf       Configuration
     * @param path       directory to scan
     * @param prefix     file-name prefix to match
     * @param pathFilter optional filter; {@code null} accepts everything
     * @return matching file paths
     * @throws java.io.IOException on HDFS I/O failure
     */
    public static List<String> getPrefixFiles(Configuration conf, String path, String prefix, PathFilter pathFilter)
            throws IOException {
        List<String> matched = new ArrayList<String>();
        FileSystem fs = FileSystem.get(conf);
        PathFilter effectiveFilter = pathFilter != null ? pathFilter : new BypassPathFilter();
        FileStatus[] statuses = fs.listStatus(new Path(path), effectiveFilter);
        if (statuses == null) {
            return matched;
        }
        for (FileStatus status : statuses) {
            if (status.isDir()) {
                continue;
            }
            if (status.getPath().getName().startsWith(prefix)) {
                matched.add(status.getPath().toUri().getPath());
            }
        }
        return matched;
    }
}