com.ibm.jaql.io.hadoop.FileOutputConfigurator.java Source code

Introduction

Here is the source code for com.ibm.jaql.io.hadoop.FileOutputConfigurator.java, a Jaql Configurator that writes output settings for an OutputFormat into a Hadoop JobConf.

Source

/*
 * Copyright (C) IBM Corp. 2008.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package com.ibm.jaql.io.hadoop;

import java.io.IOException;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.mapred.FileOutputCommitter;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCommitter;

import com.ibm.jaql.io.AdapterStore;
import com.ibm.jaql.json.type.JsonRecord;
import com.ibm.jaql.json.type.JsonValue;

/**
 * A Configurator that writes the output settings for a file-based OutputFormat
 * into the JobConf.
 */
public class FileOutputConfigurator implements InitializableConfSetter {
    protected String location;

    /*
     * (non-Javadoc)
     * 
     * @see com.ibm.jaql.io.hadoop.ConfSetter#init(java.lang.Object)
     */
    public void init(JsonValue options) throws Exception {
        location = AdapterStore.getStore().getLocation((JsonRecord) options);
    }

    /*
     * (non-Javadoc)
     * 
     * @see com.ibm.jaql.io.hadoop.ConfSetter#setSequential(org.apache.hadoop.mapred.JobConf)
     */
    public void setSequential(JobConf conf) throws Exception {
        registerSerializers(conf);

        // For an expression, the location is the final file name
        Path outPath = new Path(location);
        FileSystem fs = outPath.getFileSystem(conf);
        outPath = outPath.makeQualified(fs);
        if (fs.exists(outPath)) {
            // TODO: Jaql currently has overwrite semantics; add flag to control this
            if (fs.isFile(outPath)) {
                fs.delete(outPath, false);
            } else {
                // Look for a map-reduce output directory
                FileStatus[] nonMR = fs.listStatus(outPath, new PathFilter() {
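                    // Accept at most one non-map-reduce entry; a single
                    // offending path is enough to reject the directory.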
                    boolean onlyOne = true;

                    public boolean accept(Path path) {
                        String name = path.getName();
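                        // Ignore ".", "..", part files ("part-00000"), and their
                        // CRC checksum files (".part-00000.crc"); anything else
                        // means this is not a map-reduce output directory.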
                        if (name.matches("([.][.]?)|([.]part-[0-9]+.crc)|(part-[0-9]+)")) {
                            return false;
                        }
                        if (onlyOne) {
                            onlyOne = false;
                            return true;
                        }
                        return false;
                    }
                });
                if (nonMR.length > 0) {
                    throw new IOException(
                            "directory exists and is not a map-reduce output directory: " + nonMR[0].getPath());
                }
                fs.delete(outPath, true);
            }
        }

        // In sequential mode, we will write directly to the output file
        // and bypass the _temporary directory and rename of the standard 
        // FileOutputCommitter by using our own DirectFileOutputCommitter.
        FileOutputFormat.setOutputPath(conf, outPath.getParent());
        conf.setClass("mapred.output.committer.class", DirectFileOutputCommiter.class, OutputCommitter.class);
    }

    protected void registerSerializers(JobConf conf) {
        conf.setOutputKeyClass(JsonHolderDefault.class);
        conf.setOutputValueClass(JsonHolderDefault.class);
        HadoopSerializationDefault.register(conf);
    }

    /*
     * (non-Javadoc)
     * 
     * @see com.ibm.jaql.io.hadoop.ConfSetter#setParallel(org.apache.hadoop.mapred.JobConf)
     */
    public void setParallel(JobConf conf) throws Exception {
        registerSerializers(conf);

        // For map-reduce, multiple files can be produced, so the location is their
        // parent directory.
        Path outPath = new Path(location);
        FileSystem fs = outPath.getFileSystem(conf);
        fs.delete(outPath, true); // TODO: Jaql currently has overwrite semantics; add flag to control this
        FileOutputFormat.setOutputPath(conf, outPath);
    }
}
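
Usage sketch

For orientation, here is a minimal driver sketch (not part of Jaql) showing how the configurator might be applied to a JobConf. To avoid depending on the exact shape of the options record consumed by init(), it sets the protected location field through an anonymous subclass; the class name FileOutputConfiguratorDemo and the output path are made up for illustration.

import org.apache.hadoop.mapred.JobConf;

import com.ibm.jaql.io.hadoop.FileOutputConfigurator;

public class FileOutputConfiguratorDemo {
    public static void main(String[] args) throws Exception {
        // Hypothetical output path; in Jaql the location would normally come
        // from the options record passed to init().
        FileOutputConfigurator configurator = new FileOutputConfigurator() {
            {
                location = "/tmp/jaql-demo/output";
            }
        };

        // Parallel (map-reduce) mode: the location is used as the job's output
        // directory, and any existing contents are deleted (overwrite semantics).
        JobConf conf = new JobConf();
        configurator.setParallel(conf);
        System.out.println("mapred.output.dir = " + conf.get("mapred.output.dir"));
    }
}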