org.apache.crunch.io.impl.FileTargetImpl.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.crunch.io.impl.FileTargetImpl.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.crunch.io.impl;

import java.io.IOException;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.google.common.collect.ImmutableMap;
import org.apache.commons.lang.builder.HashCodeBuilder;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.crunch.CrunchRuntimeException;
import org.apache.crunch.SourceTarget;
import org.apache.crunch.Target;
import org.apache.crunch.impl.mr.plan.PlanningParameters;
import org.apache.crunch.io.CrunchOutputs;
import org.apache.crunch.io.FileNamingScheme;
import org.apache.crunch.io.FormatBundle;
import org.apache.crunch.io.OutputHandler;
import org.apache.crunch.io.PathTarget;
import org.apache.crunch.io.SourceTargetHelper;
import org.apache.crunch.types.Converter;
import org.apache.crunch.types.PType;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * A {@link PathTarget} that writes job output as files beneath a single destination {@link Path}.
 *
 * <p>The target carries a {@link FormatBundle} describing the output format (plus any extra
 * configuration entries) and a {@link FileNamingScheme} used to name the files that are moved
 * into the destination directory once a job has finished.
 *
 * <p>Two targets of the same concrete class are considered equal when their paths are equal.
 */
public class FileTargetImpl implements PathTarget {

    private static final Log LOG = LogFactory.getLog(FileTargetImpl.class);

    /**
     * Locates the partition number in a raw reducer output name ({@code ...-r-NNNNN}).
     * Five <em>or more</em> digits are accepted: Hadoop pads part numbers to a minimum of five
     * digits, so partitions from 100000 upwards are written with additional digits.
     */
    private static final Pattern REDUCE_PARTITION_PATTERN = Pattern.compile(".*-r-(\\d{5,})");

    /** Destination directory for the output files. */
    protected final Path path;
    private final FormatBundle<? extends FileOutputFormat> formatBundle;
    private final FileNamingScheme fileNamingScheme;

    /**
     * Creates a target with no extra output configuration.
     *
     * @param path the destination directory
     * @param outputFormatClass the output format used to write the files
     * @param fileNamingScheme the scheme used to name the final output files
     */
    public FileTargetImpl(Path path, Class<? extends FileOutputFormat> outputFormatClass,
            FileNamingScheme fileNamingScheme) {
        this(path, outputFormatClass, fileNamingScheme, ImmutableMap.<String, String>of());
    }

    /**
     * Creates a target whose format bundle is pre-populated with extra configuration entries.
     *
     * @param path the destination directory
     * @param outputFormatClass the output format used to write the files
     * @param fileNamingScheme the scheme used to name the final output files
     * @param extraConf key/value pairs to set on the format bundle; may be {@code null} or empty
     */
    public FileTargetImpl(Path path, Class<? extends FileOutputFormat> outputFormatClass,
            FileNamingScheme fileNamingScheme, Map<String, String> extraConf) {
        this.path = path;
        this.formatBundle = FormatBundle.forOutput(outputFormatClass);
        this.fileNamingScheme = fileNamingScheme;
        if (extraConf != null) {
            for (Map.Entry<String, String> entry : extraConf.entrySet()) {
                formatBundle.set(entry.getKey(), entry.getValue());
            }
        }
    }

    /**
     * Records an additional configuration entry on this target's format bundle.
     *
     * @param key the configuration key
     * @param value the configuration value
     * @return this target, to allow chaining
     */
    @Override
    public Target outputConf(String key, String value) {
        formatBundle.set(key, value);
        return this;
    }

    /**
     * Configures {@code job} to write to {@code outputPath} using this target's format bundle and
     * the key/value classes reported by the converter of {@code ptype}.
     *
     * @param job the job to configure
     * @param ptype the type whose converter supplies the output key and value classes
     * @param outputPath the path the job writes to
     * @param name the named output to register, or {@code null} to configure the job's main output
     */
    @Override
    public void configureForMapReduce(Job job, PType<?> ptype, Path outputPath, String name) {
        Converter converter = ptype.getConverter();
        Class keyClass = converter.getKeyClass();
        Class valueClass = converter.getValueClass();
        configureForMapReduce(job, keyClass, valueClass, formatBundle, outputPath, name);
    }

    /**
     * Configures {@code job} using a fresh format bundle built from {@code outputFormatClass}.
     *
     * @deprecated use
     *     {@link #configureForMapReduce(Job, Class, Class, FormatBundle, Path, String)} instead.
     */
    @Deprecated
    protected void configureForMapReduce(Job job, Class keyClass, Class valueClass, Class outputFormatClass,
            Path outputPath, String name) {
        configureForMapReduce(job, keyClass, valueClass, FormatBundle.forOutput(outputFormatClass), outputPath,
                name);
    }

    /**
     * Sets the job's output path and then either configures the job's main output (when
     * {@code name} is {@code null}) or registers a named output via {@link CrunchOutputs}.
     *
     * @param job the job to configure
     * @param keyClass the output key class
     * @param valueClass the output value class
     * @param formatBundle the output format and its configuration
     * @param outputPath the path the job writes to
     * @param name the named output to register, or {@code null} for the main output
     * @throws RuntimeException wrapping any exception raised while setting the output path
     */
    protected void configureForMapReduce(Job job, Class keyClass, Class valueClass, FormatBundle formatBundle,
            Path outputPath, String name) {
        try {
            FileOutputFormat.setOutputPath(job, outputPath);
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
        if (name == null) {
            job.setOutputFormatClass(formatBundle.getFormatClass());
            formatBundle.configure(job.getConfiguration());
            job.setOutputKeyClass(keyClass);
            job.setOutputValueClass(valueClass);
        } else {
            CrunchOutputs.addNamedOutput(job, name, formatBundle, keyClass, valueClass);
        }
    }

    /**
     * Registers this target with {@code handler} for the given type; this implementation always
     * accepts.
     *
     * @return always {@code true}
     */
    @Override
    public boolean accept(OutputHandler handler, PType<?> ptype) {
        handler.configure(this, ptype);
        return true;
    }

    /**
     * Returns the converter of {@code ptype} unchanged.
     */
    @Override
    public Converter<?, ?, ?, ?> getConverter(PType<?> ptype) {
        return ptype.getConverter();
    }

    /**
     * Moves the files produced for output {@code index} from {@code workingPath} into this
     * target's directory and then writes an empty {@code _SUCCESS} marker there.
     *
     * <p>The destination directory is created if it does not exist. When the source file system
     * can qualify the destination path the files are renamed; otherwise they are copied across
     * file systems and the sources are deleted.
     *
     * @param conf the configuration used to resolve file systems
     * @param workingPath the directory holding the raw job output
     * @param index the index of the output whose files should be moved
     * @throws IOException if a file system operation fails, including a rename that reports
     *     failure by returning {@code false}
     */
    @Override
    public void handleOutputs(Configuration conf, Path workingPath, int index) throws IOException {
        FileSystem srcFs = workingPath.getFileSystem(conf);
        Path src = getSourcePattern(workingPath, index);
        Path[] srcs = FileUtil.stat2Paths(srcFs.globStatus(src), src);
        FileSystem dstFs = path.getFileSystem(conf);
        if (!dstFs.exists(path)) {
            dstFs.mkdirs(path);
        }
        boolean sameFs = isCompatible(srcFs, path);
        for (Path s : srcs) {
            // A "-m-" in the raw file name marks output written by a map task.
            Path d = getDestFile(conf, s, path, s.getName().contains("-m-"));
            if (sameFs) {
                // rename() signals failure through its return value; ignoring it would silently
                // drop this file while the _SUCCESS marker below is still written.
                if (!srcFs.rename(s, d)) {
                    throw new IOException("Failed to rename " + s + " to " + d);
                }
            } else {
                FileUtil.copy(srcFs, s, dstFs, d, true, true, conf);
            }
        }
        dstFs.create(getSuccessIndicator(), true).close();
    }

    /** Returns the location of the {@code _SUCCESS} marker inside the target directory. */
    private Path getSuccessIndicator() {
        return new Path(path, "_SUCCESS");
    }

    /**
     * Builds the glob that selects the raw files written for output {@code index} under
     * {@code workingPath}.
     */
    protected Path getSourcePattern(Path workingPath, int index) {
        return new Path(workingPath, PlanningParameters.MULTI_OUTPUT_PREFIX + index + "-*");
    }

    /** Returns the destination directory of this target. */
    @Override
    public Path getPath() {
        return path;
    }

    /**
     * Reports whether {@code fs} is able to qualify {@code path}.
     *
     * @return {@code true} if {@code fs.makeQualified(path)} succeeds, {@code false} if it throws
     *     an {@link IllegalArgumentException}
     */
    protected static boolean isCompatible(FileSystem fs, Path path) {
        try {
            fs.makeQualified(path);
            return true;
        } catch (IllegalArgumentException e) {
            return false;
        }
    }

    /**
     * Computes the final location of a raw output file inside {@code dir}.
     *
     * <p>The base name comes from the file naming scheme (map or reduce variant); everything from
     * the first {@code '.'} of the source file name onwards is appended so the extension is kept.
     *
     * @param conf the configuration passed to the naming scheme
     * @param src the raw output file
     * @param dir the destination directory
     * @param mapOnlyJob whether {@code src} should be named as map output rather than reduce output
     * @return the destination path for {@code src}
     * @throws IOException if the naming scheme fails
     * @throws IllegalArgumentException if {@code src} is treated as reduce output but its name
     *     carries no parseable partition number
     */
    protected Path getDestFile(Configuration conf, Path src, Path dir, boolean mapOnlyJob) throws IOException {
        String sourceFilename = src.getName();
        String outputFilename;
        if (mapOnlyJob) {
            outputFilename = getFileNamingScheme().getMapOutputName(conf, dir);
        } else {
            outputFilename = getFileNamingScheme().getReduceOutputName(conf, dir,
                    extractPartitionNumber(sourceFilename));
        }
        int extensionStart = sourceFilename.indexOf('.');
        if (extensionStart >= 0) {
            outputFilename += sourceFilename.substring(extensionStart);
        }
        return new Path(dir, outputFilename);
    }

    /**
     * Extract the partition number from a raw reducer output filename.
     *
     * <p>The number is the run of at least five digits following the last {@code -r-} that is
     * followed by such a run, e.g. {@code 3} for {@code out0-r-00003.avro}.
     *
     * @param reduceOutputFileName The raw reducer output file name
     * @return The partition number encoded in the filename
     * @throws IllegalArgumentException if the name contains no {@code -r-} followed by at least
     *     five digits, or the digits do not fit in an {@code int}
     */
    public static int extractPartitionNumber(String reduceOutputFileName) {
        Matcher matcher = REDUCE_PARTITION_PATTERN.matcher(reduceOutputFileName);
        if (!matcher.find()) {
            throw new IllegalArgumentException(
                    "Reducer output name '" + reduceOutputFileName + "' cannot be parsed");
        }
        return Integer.parseInt(matcher.group(1), 10);
    }

    /** Returns the scheme used to name files moved into the target directory. */
    @Override
    public FileNamingScheme getFileNamingScheme() {
        return fileNamingScheme;
    }

    /**
     * Two targets are equal when they have exactly the same class and equal paths.
     */
    @Override
    public boolean equals(Object other) {
        if (other == null || !getClass().equals(other.getClass())) {
            return false;
        }
        FileTargetImpl o = (FileTargetImpl) other;
        return path.equals(o.path);
    }

    /** Hash code derived from the path only, consistent with {@link #equals(Object)}. */
    @Override
    public int hashCode() {
        return new HashCodeBuilder().append(path).toHashCode();
    }

    /** Returns {@code <OutputFormatSimpleName>(<path>)}. */
    @Override
    public String toString() {
        return new StringBuilder().append(formatBundle.getFormatClass().getSimpleName()).append("(").append(path)
                .append(")").toString();
    }

    /**
     * Not supported by this base implementation.
     *
     * @return always {@code null}
     */
    @Override
    public <T> SourceTarget<T> asSourceTarget(PType<T> ptype) {
        // By default, assume that we cannot do this.
        return null;
    }

    /**
     * Applies the given write mode to any data already present at the target path.
     *
     * <ul>
     * <li>{@code DEFAULT}: fails if the path exists.</li>
     * <li>{@code OVERWRITE}: deletes the existing path.</li>
     * <li>{@code APPEND}: leaves existing data in place.</li>
     * <li>{@code CHECKPOINT}: keeps the data if a {@code _SUCCESS} marker exists and the target is
     * newer than {@code lastModForSource}; otherwise deletes it and returns {@code false}.</li>
     * </ul>
     *
     * @param strategy the write mode to apply
     * @param lastModForSource last-modified time of the source data, compared against the target
     * @param conf the configuration used to resolve the file system
     * @return {@code true} if the path existed when checked (except for the discarded-checkpoint
     *     case, which returns {@code false}); {@code false} if it did not exist
     * @throws CrunchRuntimeException if the file system cannot be accessed, if the path exists in
     *     {@code DEFAULT} mode, or if the write mode is not recognised
     */
    @Override
    public boolean handleExisting(WriteMode strategy, long lastModForSource, Configuration conf) {
        FileSystem fs = null;
        try {
            fs = path.getFileSystem(conf);
        } catch (IOException e) {
            LOG.error("Could not retrieve FileSystem object to check for existing path", e);
            throw new CrunchRuntimeException(e);
        }

        boolean exists = false;
        boolean successful = false;
        long lastModForTarget = -1;
        try {
            exists = fs.exists(path);
            if (exists) {
                successful = fs.exists(getSuccessIndicator());
                lastModForTarget = SourceTargetHelper.getLastModifiedAt(fs, path);
            }
        } catch (IOException e) {
            LOG.error("Exception checking existence of path: " + path, e);
            throw new CrunchRuntimeException(e);
        }

        if (exists) {
            switch (strategy) {
            case DEFAULT:
                LOG.error("Path " + path + " already exists!");
                throw new CrunchRuntimeException("Path already exists: " + path);
            case OVERWRITE:
                LOG.info("Removing data at existing path: " + path);
                try {
                    fs.delete(path, true);
                } catch (IOException e) {
                    // Best effort: the failure is logged and processing continues.
                    LOG.error("Exception thrown removing data at path: " + path, e);
                }
                break;
            case APPEND:
                LOG.info("Adding output files to existing path: " + path);
                break;
            case CHECKPOINT:
                if (successful && lastModForTarget > lastModForSource) {
                    LOG.info("Re-starting pipeline from checkpoint path: " + path);
                    break;
                } else {
                    if (!successful) {
                        LOG.info("_SUCCESS file not found, Removing data at existing checkpoint path: " + path);
                    } else {
                        LOG.info("Source data has recent updates. Removing data at existing checkpoint path: "
                                + path);
                    }
                    try {
                        fs.delete(path, true);
                    } catch (IOException e) {
                        // Best effort: the failure is logged and processing continues.
                        LOG.error("Exception thrown removing data at checkpoint path: " + path, e);
                    }
                    // The checkpoint was discarded, so report that nothing usable exists.
                    return false;
                }
            default:
                throw new CrunchRuntimeException("Unknown WriteMode:  " + strategy);
            }
        } else {
            LOG.info("Will write output files to new path: " + path);
        }
        return exists;
    }

}