com.intel.genomicsdb.GenomicsDBInputFormat.java Source code

Java tutorial

Introduction

Here is the source code for com.intel.genomicsdb.GenomicsDBInputFormat.java

Source

/*
 * The MIT License (MIT)
 * Copyright (c) 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy of
 * this software and associated documentation files (the "Software"), to deal in
 * the Software without restriction, including without limitation the rights to
 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 * the Software, and to permit persons to whom the Software is furnished to do so,
 * subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
 * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
 * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
 * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

package com.intel.genomicsdb;

import htsjdk.tribble.Feature;
import htsjdk.tribble.FeatureCodec;
import htsjdk.variant.bcf2.BCF2Codec;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.*;
import org.apache.log4j.Logger;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class GenomicsDBInputFormat<VCONTEXT extends Feature, SOURCE> extends InputFormat<String, VCONTEXT>
        implements Configurable {

    private GenomicsDBConfiguration genomicsDBConfiguration;
    private Configuration configuration;

    Logger logger = Logger.getLogger(GenomicsDBInputFormat.class);

    /**
     * When this function is called, it is already assumed that configuration
     * object is set
     *
     * @param jobContext  Hadoop Job context passed from newAPIHadoopRDD
     *                    defined in SparkContext
     * @return  Returns a list of input splits
     * @throws FileNotFoundException  Thrown if creaing configuration object fails
     */
    public List<InputSplit> getSplits(JobContext jobContext) throws FileNotFoundException {

        genomicsDBConfiguration = new GenomicsDBConfiguration(configuration);
        genomicsDBConfiguration.setLoaderJsonFile(configuration.get(GenomicsDBConfiguration.LOADERJSON));
        genomicsDBConfiguration.setQueryJsonFile(configuration.get(GenomicsDBConfiguration.QUERYJSON));
        genomicsDBConfiguration.setHostFile(configuration.get(GenomicsDBConfiguration.MPIHOSTFILE));

        List<String> hosts = genomicsDBConfiguration.getHosts();

        ArrayList<InputSplit> inputSplits = new ArrayList<>(hosts.size());
        for (int i = 0; i < hosts.size(); ++i) {
            GenomicsDBInputSplit split = new GenomicsDBInputSplit();
            inputSplits.add(split);
        }

        return inputSplits;
    }

    public RecordReader<String, VCONTEXT> createRecordReader(InputSplit inputSplit,
            TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {

        String loaderJson;
        String queryJson;

        GenomicsDBFeatureReader<VCONTEXT, SOURCE> featureReader;
        GenomicsDBRecordReader<VCONTEXT, SOURCE> recordReader;

        if (taskAttemptContext != null) {
            Configuration configuration = taskAttemptContext.getConfiguration();
            loaderJson = configuration.get(GenomicsDBConfiguration.LOADERJSON);
            queryJson = configuration.get(GenomicsDBConfiguration.QUERYJSON);
        } else {
            // If control comes here, means this method is called from
            // GenomicsDBRDD. Hence, the configuration object must be
            // set by setConf method, else this will lead to
            // NullPointerException
            assert (configuration != null);
            loaderJson = configuration.get(GenomicsDBConfiguration.LOADERJSON);
            queryJson = configuration.get(GenomicsDBConfiguration.QUERYJSON);
        }

        featureReader = new GenomicsDBFeatureReader<VCONTEXT, SOURCE>(loaderJson, queryJson,
                (FeatureCodec<VCONTEXT, SOURCE>) new BCF2Codec());
        recordReader = new GenomicsDBRecordReader<VCONTEXT, SOURCE>(featureReader);
        return recordReader;
    }

    /**
     * default constructor
     */
    public GenomicsDBInputFormat() {
    }

    public GenomicsDBInputFormat(GenomicsDBConfiguration conf) {
        genomicsDBConfiguration = conf;
    }

    /**
     * Set the loader JSON file path
     *
     * @param jsonFile  Full qualified path of the loader JSON file
     * @return  Returns the same object for forward function calls
     */
    public GenomicsDBInputFormat<VCONTEXT, SOURCE> setLoaderJsonFile(String jsonFile) {
        genomicsDBConfiguration.setLoaderJsonFile(jsonFile);
        return this;
    }

    /**
     * Set the query JSON file path
     * @param jsonFile  Full qualified path of the query JSON file
     * @return  Returns the same object for forward function calls
     */
    public GenomicsDBInputFormat<VCONTEXT, SOURCE> setQueryJsonFile(String jsonFile) {
        genomicsDBConfiguration.setQueryJsonFile(jsonFile);
        return this;
    }

    /**
     * Set the host file path
     * @param hostFile  Full qualified path of the hosts file
     * @return  Returns the same object for forward function calls
     * @throws FileNotFoundException thrown if the hosts file is not found
     */
    public GenomicsDBInputFormat<VCONTEXT, SOURCE> setHostFile(String hostFile) throws FileNotFoundException {
        genomicsDBConfiguration.setHostFile(hostFile);
        return this;
    }

    @Override
    public void setConf(Configuration configuration) {
        this.configuration = configuration;
    }

    @Override
    public Configuration getConf() {
        return configuration;
    }
}