com.datasalt.utils.commons.HadoopUtils.java Source code

Java tutorial

Introduction

Here is the source code for com.datasalt.utils.commons.HadoopUtils.java

Source

/**
 * Copyright [2011] [Datasalt Systems S.L.]
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.datasalt.utils.commons;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.util.HashMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapreduce.TaskInputOutputContext;

/**
 * <p>
 * Useful methods for manipulating things in HDFS, etc.
 * </p>
 * 
 * @author pere
 * 
 */
public class HadoopUtils {

    public static void deleteIfExists(FileSystem dFs, Path path) throws IOException {
        if (dFs.exists(path)) {
            dFs.delete(path, true);
        }
    }

    public static void synchronize(FileSystem fS1, Path p1, FileSystem fS2, Path p2) throws IOException {
        deleteIfExists(fS2, p2);
        FileUtil.copy(fS1, p1, fS2, p2, false, false, fS1.getConf());
    }

    /**
     * Creates a file with the given string, overwritting if needed.
     */
    public static void stringToFile(FileSystem fs, Path path, String string) throws IOException {
        OutputStream os = fs.create(path, true);
        PrintWriter pw = new PrintWriter(os);
        pw.append(string);
        pw.close();
    }

    /**
     * Reads the content of a file into a String. Return null if the file does not exist.
     */
    public static String fileToString(FileSystem fs, Path path) throws IOException {
        if (!fs.exists(path)) {
            return null;
        }

        InputStream is = fs.open(path);
        InputStreamReader isr = new InputStreamReader(is);
        BufferedReader br = new BufferedReader(isr);
        char[] buff = new char[256];
        StringBuilder sb = new StringBuilder();
        int read;
        while ((read = br.read(buff)) != -1) {
            sb.append(buff, 0, read);
        }
        br.close();
        return sb.toString();
    }

    /**
     * Reads maps of integer -> double
     */
    public static HashMap<Integer, Double> readIntDoubleMap(Path path, FileSystem fs) throws IOException {
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, fs.getConf());

        IntWritable topic = new IntWritable();
        DoubleWritable value = new DoubleWritable();

        HashMap<Integer, Double> ret = new HashMap<Integer, Double>();

        while (reader.next(topic)) {
            reader.getCurrentValue(value);

            ret.put(topic.get(), value.get());
        }

        reader.close();
        return ret;
    }

    /**
     * Reads maps of integer -> double from glob paths like "folder/part-r*"
     */
    public static HashMap<Integer, Double> readIntDoubleMapFromGlob(Path glob, FileSystem fs) throws IOException {
        FileStatus status[] = fs.globStatus(glob);
        HashMap<Integer, Double> ret = new HashMap<Integer, Double>();
        for (FileStatus fileS : status) {
            ret.putAll(readIntDoubleMap(fileS.getPath(), fs));
        }
        return ret;
    }

    /**
     * Reads maps of integer -> integer
     */
    public static HashMap<Integer, Integer> readIntIntMap(Path path, FileSystem fs) throws IOException {
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, fs.getConf());

        IntWritable topic = new IntWritable();
        IntWritable value = new IntWritable();

        HashMap<Integer, Integer> ret = new HashMap<Integer, Integer>();

        while (reader.next(topic)) {
            reader.getCurrentValue(value);

            ret.put(topic.get(), value.get());
        }

        reader.close();
        return ret;
    }

    /**
     * Reads maps of integer -> integer from glob paths like "folder/part-r*"
     */
    public static HashMap<Integer, Integer> readIntIntMapFromGlob(Path glob, FileSystem fs) throws IOException {
        FileStatus status[] = fs.globStatus(glob);
        HashMap<Integer, Integer> ret = new HashMap<Integer, Integer>();
        for (FileStatus fileS : status) {
            ret.putAll(readIntIntMap(fileS.getPath(), fs));
        }
        return ret;
    }

    /**
     * Utility for doing ctx.getCounter(groupName, counter.toString()).increment(1);
     */
    @SuppressWarnings("rawtypes")
    public static void incCounter(TaskInputOutputContext ctx, String groupName, Enum counter) {
        ctx.getCounter(groupName, counter.toString()).increment(1);
    }

    /**
     * Given a file post-fix, locate a file in the DistributedCache
     * 
     * @param conf
     * @param filePostFix
     * 
     * @throws IOException
     */
    public static Path locateFileInDC(Configuration conf, String filePostFix) throws IOException {
        Path locatedFile = null;
        Path[] paths = DistributedCache.getLocalCacheFiles(conf);
        if (paths == null) {
            return null;
        }
        for (Path p : paths) {
            if (p.toString().endsWith(filePostFix)) {
                locatedFile = p;
                break;
            }
        }
        return locatedFile;
    }
}