Java tutorial
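This walkthrough covers CompositeInputAdapter from the Apache-licensed jaql project (package com.ibm.jaql.io.hadoop): a Hadoop InputFormat-style input adapter that takes an array of HadoopInputAdapters and reads the union of their inputs, optionally tagging each value with the index of the input it came from. The full source follows, with a short usage sketch at the end.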
/*
 * Copyright (C) IBM Corp. 2008.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package com.ibm.jaql.io.hadoop;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;

import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.ReflectionUtils;

import com.ibm.jaql.io.Adapter;
import com.ibm.jaql.io.AdapterStore;
import com.ibm.jaql.io.ClosableJsonIterator;
import com.ibm.jaql.io.registry.RegistryUtil;
import com.ibm.jaql.json.schema.ArraySchema;
import com.ibm.jaql.json.schema.OrSchema;
import com.ibm.jaql.json.schema.Schema;
import com.ibm.jaql.json.schema.SchemaFactory;
import com.ibm.jaql.json.type.BufferedJsonArray;
import com.ibm.jaql.json.type.JsonArray;
import com.ibm.jaql.json.type.JsonLong;
import com.ibm.jaql.json.type.JsonRecord;
import com.ibm.jaql.json.type.JsonValue;
import com.ibm.jaql.json.type.MutableJsonLong;

// TODO: look into factoring some of this code with DefaultHadoopInputAdapter

/** Takes an array of HadoopInputAdapters and operates on the union of their inputs. */
public class CompositeInputAdapter implements HadoopInputAdapter {
  public static String CURRENT_IDX_NAME = "com.ibm.jaql.lang.CompositeinputAdapter.currentIdx";
  public static String ADD_INDEX_NAME   = "com.ibm.jaql.lang.CompositeinputAdapter.addIndex";

  private JsonArray args;

  // true => return [inputIndex, value] pairs, else just the value.
  // This is triggered by using a record for a descriptor; an array will not add the index.
  // TODO: This is a bit of an ugly way to do this, but it keeps mapReduce working until
  // we decide if we should eliminate direct co-group support from mapReduce()
  private boolean addIndex;

  private HadoopInputAdapter[] adapters;

  /*
   * (non-Javadoc)
   *
   * @see com.ibm.jaql.io.Adapter#initializeFrom(com.ibm.jaql.json.type.Item)
   */
  @Override
  public void init(JsonValue val) throws Exception {
    // TODO: eliminate
    if (val instanceof JsonArray) {
      addIndex = false;
      initializeFrom((JsonArray) val);
    } else if (val instanceof JsonRecord) {
      addIndex = true;
      // dig the location out
      JsonRecord rval = (JsonRecord) val;
      JsonArray loc = (JsonArray) rval.get(CompositeOutputAdapter.DESCRIPTORS);
      if (loc == null) {
        loc = (JsonArray) rval.get(Adapter.LOCATION_NAME); // TODO: eliminate
      }
      initializeFrom(loc);
    } else {
      throw new IllegalArgumentException("invalid composite descriptor");
    }
  }

  /**
   * @param args
   * @throws Exception
   */
  private void initializeFrom(JsonArray args) throws Exception {
    this.args = args;

    // 1. make an InputAdapter array of the same size as args
    int numAdapters = (int) this.args.count();
    adapters = new HadoopInputAdapter[numAdapters];

    // 2. instantiate and initialize all StorableInputAdapters
    for (int i = 0; i < numAdapters; i++) {
      JsonValue value = this.args.get(i);
      // adapters[i] = AdapterStore.getInputAdapter((JRecord) item.getNonNull(), item);
      adapters[i] = (HadoopInputAdapter) AdapterStore.getStore().input.getAdapter(value);
    }
  }
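  // Descriptor shapes accepted by init(), for illustration only (the record
  // key is whatever CompositeOutputAdapter.DESCRIPTORS names; "descriptors"
  // and the location fields below are hypothetical -- each entry just has to
  // resolve to a HadoopInputAdapter via the AdapterStore registry):
  //   array form:  [ {location: "in0", ...}, {location: "in1", ...} ]
  //     -> addIndex = false, readers yield plain values
  //   record form: { descriptors: [ {location: "in0", ...}, ... ] }
  //     -> addIndex = true, readers yield [inputIndex, value] pairs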
  /*
   * (non-Javadoc)
   *
   * @see com.ibm.jaql.io.Adapter#open()
   */
  public void open() throws Exception {
    // for each adapter, call its open
    int numAdapters = adapters.length;
    for (int i = 0; i < numAdapters; i++) {
      adapters[i].open();
    }
  }

  /*
   * (non-Javadoc)
   *
   * @see com.ibm.jaql.io.InputAdapter#getItemReader()
   */
  public ClosableJsonIterator iter() throws Exception {
    if (addIndex) {
      final MutableJsonLong index = new MutableJsonLong(0);
      final BufferedJsonArray pair = new BufferedJsonArray(2);
      pair.set(0, index);

      return new ClosableJsonIterator(pair) {
        // TODO: temporary hack until input gets updated
        ClosableJsonIterator baseReader = null;
        int idx = 0;

        @Override
        public boolean moveNext() throws Exception {
          while (true) {
            if (baseReader == null) {
              if (idx >= adapters.length) {
                return false;
              }
              index.set(idx);
              baseReader = adapters[idx++].iter();
            }
            if (baseReader.moveNext()) {
              pair.set(1, baseReader.current());
              return true;
            } else {
              baseReader.close();
              baseReader = null;
            }
          }
        }
      };
    } else {
      return new ClosableJsonIterator() {
        // TODO: temporary hack until input gets updated
        ClosableJsonIterator baseReader = null;
        int idx = 0;

        @Override
        public boolean moveNext() throws Exception {
          while (true) {
            if (baseReader == null) {
              if (idx >= adapters.length) {
                return false;
              }
              baseReader = adapters[idx++].iter();
            }
            if (baseReader.moveNext()) {
              currentValue = baseReader.current();
              return true;
            } else {
              baseReader.close();
              baseReader = null;
            }
          }
        }
      };
    }
  }

  public Schema getSchema() {
    Schema[] inSchemata = new Schema[adapters.length];
    for (int i = 0; i < adapters.length; i++) {
      inSchemata[i] = adapters[i].getSchema();
    }
    Schema schema = OrSchema.make(inSchemata);
    if (addIndex) {
      // describe an array of [inputIndex, element] pairs
      schema = schema.elements();
      schema = new ArraySchema(new Schema[] { SchemaFactory.longSchema(), schema });
      schema = new ArraySchema(null, schema);
    }
    return schema;
  }

  /*
   * (non-Javadoc)
   *
   * @see org.apache.hadoop.mapred.InputFormat#getRecordReader(org.apache.hadoop.mapred.InputSplit,
   *      org.apache.hadoop.mapred.JobConf, org.apache.hadoop.mapred.Reporter)
   */
  @SuppressWarnings("unchecked")
  public RecordReader<JsonHolder, JsonHolder> getRecordReader(InputSplit split, JobConf job,
      Reporter reporter) throws IOException {
    CompositeSplit cSplit = (CompositeSplit) split;

    // 1. get the InputAdapter's array index (i) from the split
    final int idx = cSplit.getAdapterIdx();
    InputSplit baseSplit = cSplit.getSplit();

    try {
      // 2. get the ith adapter's args record
      JsonValue value = this.args.get(idx);
      // JRecord baseArgs = (JRecord) item.getNonNull();

      // record the current index in the job conf
      // ASSUMES: in map/reduce, the format's record reader is called *before*
      // the map class is configured
      writeCurrentIndex(job, idx); // FIXME: no longer needed

      // 3. instantiate and initialize the adapter
      HadoopInputAdapter adapter = (HadoopInputAdapter) AdapterStore.getStore().input
          .getAdapter(/** baseArgs, */ value);

      // 4. create a new JobConf j'
      JobConf jTmp = new JobConf(job);

      // 5. call adapter's setupConf(j')
      // ConfiguratorUtil.writeToConf(adapter, jTmp, item /** baseArgs */);
      adapter.setParallel(jTmp);

      // 6. configure the adapter from j'
      adapter.configure(jTmp);

      // 7. call adapter's getRecordReader with j'
      final RecordReader<JsonHolder, JsonHolder> reader = (RecordReader<JsonHolder, JsonHolder>) adapter
          .getRecordReader(baseSplit, jTmp, reporter);
      if (!addIndex) {
        return reader;
      }

      return new RecordReader<JsonHolder, JsonHolder>() {
        @Override
        public void close() throws IOException {
          reader.close();
        }

        @Override
        public JsonHolder createKey() {
          return reader.createKey();
        }

        @Override
        public JsonHolder createValue() {
          return reader.createValue();
        }

        @Override
        public long getPos() throws IOException {
          return reader.getPos();
        }

        @Override
        public float getProgress() throws IOException {
          return reader.getProgress();
        }

        @Override
        public boolean next(JsonHolder key, JsonHolder value) throws IOException {
          // unwrap the [index, value] pair (if present) so the base reader
          // fills in the raw value, then re-wrap it with this input's index
          BufferedJsonArray pair = (BufferedJsonArray) value.value;
          if (pair != null) {
            value.value = pair.get(1);
          } else {
            pair = new BufferedJsonArray(2);
            pair.set(0, JsonLong.make(idx));
          }
          if (reader.next(key, value)) {
            pair.set(1, value.value);
            value.value = pair;
            return true;
          }
          return false;
        }
      };
    } catch (Exception e) {
      // propagate as IOException rather than returning a null reader
      throw new IOException(e.getMessage());
    }
  }
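  // When addIndex is true, both iter() and the wrapped RecordReader hand out
  // values shaped [inputIndex, value]: e.g. [0, v] for records of the first
  // input, [1, v] for the second (values here are illustrative).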
  /*
   * (non-Javadoc)
   *
   * @see com.ibm.jaql.io.Adapter#close()
   */
  public void close() throws Exception {
    // for each adapter, call its close
    int numAdapters = adapters.length;
    for (int i = 0; i < numAdapters; i++) {
      adapters[i].close();
    }
  }

  /*
   * (non-Javadoc)
   *
   * @see org.apache.hadoop.mapred.InputFormat#getSplits(org.apache.hadoop.mapred.JobConf, int)
   */
  public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    // initialize adapters
    try {
      initializeFrom(this.args);
    } catch (Exception e) {
      throw new IOException(e.getMessage());
    }

    // for each adapter
    int numAdapters = adapters.length;
    ArrayList<CompositeSplit> allSplits = new ArrayList<CompositeSplit>();
    for (int i = 0; i < numAdapters; i++) {
      JobConf jTmp = new JobConf(job);
      try {
        // ConfiguratorUtil.writeToConf((Configurator) adapters[i], jTmp,
        //   (JRecord) this.args.nth(i).getNonNull());
        // ConfiguratorUtil.writeToConf((ConfSetter) adapters[i], jTmp, this.args.nth(i));
        adapters[i].setParallel(jTmp);
      } catch (Exception e) {
        throw new IOException(e.getMessage());
      }
      // TODO: is this needed?
      // ((DefaultHadoopInputAdapter) adapters[i]).configure(jTmp);
      adapters[i].configure(jTmp);

      // get its splits
      InputSplit[] splits = ((InputFormat<?, ?>) adapters[i]).getSplits(jTmp, numSplits);
      for (int j = 0; j < splits.length; j++) {
        // wrap the split with the InputAdapter's array index
        allSplits.add(new CompositeSplit(splits[j], i)); // FIXME: memory
      }
    }
    return allSplits.toArray(new InputSplit[allSplits.size()]);
  }

  /*
   * (non-Javadoc)
   *
   * @see org.apache.hadoop.mapred.InputFormat#validateInput(org.apache.hadoop.mapred.JobConf)
   */
  @Deprecated
  public void validateInput(JobConf job) throws IOException {
    // 1. read the args array and parse it
    try {
      this.args = ConfUtil.readConfArray(job, ConfSetter.CONFINOPTIONS_NAME);
      // instantiate and initialize it
      this.initializeFrom(this.args);

      // 2. for each adapter
      int numAdapters = adapters.length;
      for (int i = 0; i < numAdapters; i++) {
        // make a new JobConf j'
        JobConf jTmp = new JobConf(job);
        // call adapter's setupConf(j')
        // ConfiguratorUtil.writeToConf((ConfSetter) adapters[i], jTmp, this.args.nth(i));
        adapters[i].setParallel(jTmp);
        // ConfiguratorUtil.writeToConf((Configurator) adapters[i], jTmp,
        //   (JRecord) this.args.nth(i).getNonNull());
        // call adapter's validateInput(j')
        // ((InputFormat<?, ?>) adapters[i]).validateInput(jTmp);
      }
    } catch (Exception e) {
      throw new IOException(e.getMessage());
    }
  }
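  // Conf round-trip, for orientation: on the submitting side, set() below
  // writes the descriptor array into the JobConf under
  // ConfSetter.CONFINOPTIONS_NAME; on each task, configure() reads it back,
  // and getRecordReader() re-resolves the adapter for the split's index.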
  /*
   * (non-Javadoc)
   *
   * @see com.ibm.jaql.io.hadoop.ConfSetter#setSequential(org.apache.hadoop.mapred.JobConf)
   */
  public void setSequential(JobConf conf) throws Exception {
    set(conf);
  }

  /*
   * (non-Javadoc)
   *
   * @see com.ibm.jaql.io.hadoop.ConfSetter#setParallel(org.apache.hadoop.mapred.JobConf)
   */
  public void setParallel(JobConf conf) throws Exception {
    set(conf);
  }

  // FIXME: the "args" here makes no sense... should be JArray
  /**
   * @param conf
   * @throws Exception
   */
  protected void set(JobConf conf) throws Exception {
    conf.setInputFormat(this.getClass());
    // write out the input adapter args array
    ConfUtil.writeConfArray(conf, ConfSetter.CONFINOPTIONS_NAME, this.args);
    conf.set(ADD_INDEX_NAME, Boolean.toString(addIndex));
  }

  /*
   * (non-Javadoc)
   *
   * @see org.apache.hadoop.mapred.JobConfigurable#configure(org.apache.hadoop.mapred.JobConf)
   */
  public void configure(JobConf conf) {
    Globals.setJobConf(conf);
    try {
      RegistryUtil.readConf(conf, HadoopAdapter.storeRegistryVarName, AdapterStore.getStore());
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
    // read in the adapter array from conf
    try {
      this.args = ConfUtil.readConfArray(conf, ConfSetter.CONFINOPTIONS_NAME);
      this.addIndex = Boolean.parseBoolean(conf.get(ADD_INDEX_NAME));
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * @param job
   * @param idx
   * @throws Exception
   */
  private void writeCurrentIndex(JobConf job, int idx) throws Exception {
    job.set(CURRENT_IDX_NAME, String.valueOf(idx));
  }

  /**
   * @param job
   * @return
   */
  public static int readCurrentIndex(JobConf job) {
    String v = job.get(CURRENT_IDX_NAME, "0");
    return Integer.parseInt(v);
  }
}

// TODO: make this a static inner class
/** An InputSplit that tags a base split with the index of the adapter it came from. */
class CompositeSplit implements InputSplit {
  private InputSplit baseSplit;
  private int adapterIdx;

  /** Default constructor, required for Writable deserialization. */
  public CompositeSplit() {
  }

  /**
   * @param split
   * @param idx
   */
  public CompositeSplit(InputSplit split, int idx) {
    this.baseSplit = split;
    this.adapterIdx = idx;
  }

  /**
   * @return
   */
  public int getAdapterIdx() {
    return adapterIdx;
  }

  /**
   * @return
   */
  public InputSplit getSplit() {
    return baseSplit;
  }

  /*
   * (non-Javadoc)
   *
   * @see org.apache.hadoop.mapred.InputSplit#getLength()
   */
  public long getLength() throws IOException {
    return baseSplit.getLength();
  }

  /*
   * (non-Javadoc)
   *
   * @see org.apache.hadoop.mapred.InputSplit#getLocations()
   */
  public String[] getLocations() throws IOException {
    return baseSplit.getLocations();
  }

  /*
   * (non-Javadoc)
   *
   * @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput)
   */
  public void readFields(DataInput in) throws IOException {
    this.adapterIdx = in.readInt();
    String cName = in.readUTF();
    try {
      Class<? extends InputSplit> c = Class.forName(cName).asSubclass(InputSplit.class);
      this.baseSplit = ReflectionUtils.newInstance(c, null);
      this.baseSplit.readFields(in);
    } catch (ClassNotFoundException ce) {
      throw new IOException(ce.getMessage());
    }
  }
  /*
   * (non-Javadoc)
   *
   * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput)
   */
  public void write(DataOutput out) throws IOException {
    out.writeInt(adapterIdx);
    // WARNING: getCanonicalName may not work for inner classes.
    out.writeUTF(baseSplit.getClass().getCanonicalName());
    baseSplit.write(out);
  }
}
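A rough usage sketch to close out: this is not from the jaql sources. firstDescriptor and secondDescriptor are placeholders for descriptors the AdapterStore registry can resolve to HadoopInputAdapters, and the sequential read path via iter() is inferred from the Adapter methods above.

// Hypothetical driver: read the union of two inputs outside of map/reduce.
static void readUnion(JsonValue firstDescriptor, JsonValue secondDescriptor) throws Exception {
  BufferedJsonArray descriptors = new BufferedJsonArray(2);
  descriptors.set(0, firstDescriptor);
  descriptors.set(1, secondDescriptor);

  CompositeInputAdapter adapter = new CompositeInputAdapter();
  adapter.init(descriptors);      // array form, so no [index, value] wrapping
  adapter.open();
  ClosableJsonIterator it = adapter.iter();
  while (it.moveNext()) {         // values of input 0 first, then input 1
    JsonValue v = it.current();
    // ... process v ...
  }
  it.close();
  adapter.close();
}

In a map/reduce job the same descriptor array instead travels through the JobConf: setParallel() writes it on the client, configure() restores it in each task, and getSplits()/getRecordReader() fan the work out per input, as shown in the conf round-trip note above.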