Example usage for org.apache.hadoop.mapreduce JobContext getWorkingDirectory

Introduction

On this page you can find example usage for org.apache.hadoop.mapreduce JobContext getWorkingDirectory.

Prototype

public Path getWorkingDirectory() throws IOException;

Document

Get the current working directory for the default file system.
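Before the project examples below, here is a minimal, self-contained sketch (not taken from any of the source files listed on this page) showing the basic call from inside a custom InputFormat. The class name WorkingDirectoryAwareInputFormat and the relative path "data/input" are illustrative assumptions only.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

// Hypothetical InputFormat used only to demonstrate JobContext.getWorkingDirectory().
public class WorkingDirectoryAwareInputFormat extends InputFormat<LongWritable, Text> {

    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
        // Working directory of the default file system, typically /user/<userid> on HDFS.
        Path workingDirectory = context.getWorkingDirectory();

        // Resolve a relative path against the working directory and check it on the
        // file system that owns that directory.
        Path resolved = new Path(workingDirectory, "data/input");
        FileSystem fs = workingDirectory.getFileSystem(context.getConfiguration());
        System.out.println("Working directory: " + workingDirectory);
        System.out.println(resolved + " exists: " + fs.exists(resolved));

        // A real implementation would build splits from the resolved input paths;
        // this sketch returns no splits.
        return new ArrayList<InputSplit>();
    }

    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) {
        throw new UnsupportedOperationException("Not implemented in this sketch");
    }
}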

Usage

From source file: org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigInputFormat.java

License: Apache License

@SuppressWarnings({ "unchecked", "rawtypes" })
@Override
public List<InputSplit> getSplits(JobContext jobcontext) throws IOException, InterruptedException {

    Configuration conf = jobcontext.getConfiguration();

    ArrayList<FileSpec> inputs;
    ArrayList<ArrayList<OperatorKey>> inpTargets;
    PigContext pigContext;
    try {
        inputs = (ArrayList<FileSpec>) ObjectSerializer.deserialize(conf.get("pig.inputs"));
        inpTargets = (ArrayList<ArrayList<OperatorKey>>) ObjectSerializer
                .deserialize(conf.get("pig.inpTargets"));
        pigContext = (PigContext) ObjectSerializer.deserialize(conf.get("pig.pigContext"));
        PigContext.setPackageImportList(
                (ArrayList<String>) ObjectSerializer.deserialize(conf.get("udf.import.list")));
        MapRedUtil.setupUDFContext(conf);
    } catch (Exception e) {
        int errCode = 2094;
        String msg = "Unable to deserialize object.";
        throw new ExecException(msg, errCode, PigException.BUG, e);
    }

    ArrayList<InputSplit> splits = new ArrayList<InputSplit>();
    for (int i = 0; i < inputs.size(); i++) {
        try {
            Path path = new Path(inputs.get(i).getFileName());

            FileSystem fs;
            boolean isFsPath = true;
            try {
                fs = path.getFileSystem(conf);
            } catch (Exception e) {
                // If an application specific
                // scheme was used
                // (e.g.: "hbase://table") we will fail
                // getting the file system. That's
                // ok, we just use the dfs in that case.
                fs = new Path("/").getFileSystem(conf);
                isFsPath = false;
            }

            // if the execution is against Mapred DFS, set
            // working dir to /user/<userid>
            if (!Utils.isLocal(pigContext, conf)) {
                fs.setWorkingDirectory(jobcontext.getWorkingDirectory());
            }

            // First pass the input location to the loader - for this, send a
            // clone of the configuration we have. This is so that if the
            // loader (or the loader's InputFormat) decides to store the
            // input location into the configuration (for example,
            // FileInputFormat stores this in mapred.input.dir in the conf),
            // then for different inputs, the loaders don't end up
            // overwriting the same conf.
            FuncSpec loadFuncSpec = inputs.get(i).getFuncSpec();
            LoadFunc loadFunc = (LoadFunc) PigContext.instantiateFuncFromSpec(loadFuncSpec);
            boolean combinable = !(loadFunc instanceof MergeJoinIndexer || loadFunc instanceof IndexableLoadFunc
                    || (loadFunc instanceof CollectableLoadFunc && loadFunc instanceof OrderedLoadFunc));
            if (combinable)
                combinable = !conf.getBoolean("pig.noSplitCombination", false);
            JobConf confClone = new JobConf(conf);
            Job inputSpecificJob = new Job(confClone);
            // Pass loader signature to LoadFunc and to InputFormat through
            // the conf
            passLoadSignature(loadFunc, i, inputSpecificJob.getConfiguration());
            loadFunc.setLocation(inputs.get(i).getFileName(), inputSpecificJob);
            // The above setLocation call could write to the conf within
            // the inputSpecificJob - use this updated conf

            // get the InputFormat from it and ask for splits
            InputFormat inpFormat = loadFunc.getInputFormat();
            List<InputSplit> oneInputSplits = inpFormat.getSplits(
                    HadoopShims.createJobContext(inputSpecificJob.getConfiguration(), jobcontext.getJobID()));
            List<InputSplit> oneInputPigSplits = getPigSplits(oneInputSplits, i, inpTargets.get(i),
                    HadoopShims.getDefaultBlockSize(fs, isFsPath ? path : fs.getWorkingDirectory()), combinable,
                    confClone);
            splits.addAll(oneInputPigSplits);
        } catch (ExecException ee) {
            throw ee;
        } catch (Exception e) {
            int errCode = 2118;
            String msg = "Unable to create input splits for: " + inputs.get(i).getFileName();
            if (e.getMessage() != null && (!e.getMessage().isEmpty())) {
                throw new ExecException(e.getMessage(), errCode, PigException.BUG, e);
            } else {
                throw new ExecException(msg, errCode, PigException.BUG, e);
            }
        }
    }

    // XXX hadoop 20 new API integration: get around a hadoop 20 bug by
    // passing total # of splits to each split so that it can be retrieved
    // in the RecordReader method when called by mapreduce framework later.
    int n = splits.size();
    // also passing the multi-input flag to the back-end so that
    // the multi-input record counters can be created
    int m = inputs.size();

    boolean disableCounter = conf.getBoolean("pig.disable.counter", false);
    if ((m > 1) && disableCounter) {
        log.info("Disable Pig custom input counters");
    }

    for (InputSplit split : splits) {
        ((PigSplit) split).setTotalSplits(n);
        if (m > 1)
            ((PigSplit) split).setMultiInputs(true);
        ((PigSplit) split).setDisableCounter(disableCounter);
    }

    return splits;
}

From source file: org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigInputSplitFormat.java

License: Apache License

@SuppressWarnings({ "unchecked", "rawtypes" })
@Override
public List<InputSplit> getSplits(JobContext jobcontext) throws IOException {

    Configuration conf = jobcontext.getConfiguration();

    ArrayList<FileSpec> inputs;
    ArrayList<ArrayList<OperatorKey>> inpTargets;
    PigContext pigContext;
    try {
        inputs = (ArrayList<FileSpec>) ObjectSerializer.deserialize(conf.get("pig.inputs"));
        inpTargets = (ArrayList<ArrayList<OperatorKey>>) ObjectSerializer
                .deserialize(conf.get("pig.inpTargets"));
        pigContext = (PigContext) ObjectSerializer.deserialize(conf.get("pig.pigContext"));
        PigContext.setPackageImportList(
                (ArrayList<String>) ObjectSerializer.deserialize(conf.get("udf.import.list")));
        MapRedUtil.setupUDFContext(conf);
    } catch (Exception e) {
        int errCode = 2094;
        String msg = "Unable to deserialize object.";
        throw new ExecException(msg, errCode, PigException.BUG, e);
    }

    ArrayList<InputSplit> splits = new ArrayList<InputSplit>();
    for (int i = 0; i < inputs.size(); i++) {
        try {
            Path path = new Path(inputs.get(i).getFileName());

            FileSystem fs;
            boolean isFsPath = true;
            try {
                fs = path.getFileSystem(conf);
            } catch (Exception e) {
                // If an application specific
                // scheme was used
                // (e.g.: "hbase://table") we will fail
                // getting the file system. That's
                // ok, we just use the dfs in that case.
                fs = new Path("/").getFileSystem(conf);
                isFsPath = false;
            }

            // if the execution is against Mapred DFS, set
            // working dir to /user/<userid>
            if (!Utils.isLocal(pigContext, conf)) {
                fs.setWorkingDirectory(jobcontext.getWorkingDirectory());
            }

            // First pass the input location to the loader - for this, send a
            // clone of the configuration we have. This is so that if the
            // loader (or the loader's InputFormat) decides to store the
            // input location into the configuration (for example,
            // FileInputFormat stores this in mapred.input.dir in the conf),
            // then for different inputs, the loaders don't end up
            // overwriting the same conf.
            FuncSpec loadFuncSpec = inputs.get(i).getFuncSpec();
            LoadFunc loadFunc = (LoadFunc) PigContext.instantiateFuncFromSpec(loadFuncSpec);
            boolean combinable = !(loadFunc instanceof MergeJoinIndexer || loadFunc instanceof IndexableLoadFunc
                    || (loadFunc instanceof CollectableLoadFunc && loadFunc instanceof OrderedLoadFunc));
            if (combinable)
                combinable = !conf.getBoolean("pig.noSplitCombination", false);
            Configuration confClone = new Configuration(conf);
            Job inputSpecificJob = new Job(confClone);
            // Pass loader signature to LoadFunc and to InputFormat through
            // the conf
            passLoadSignature(loadFunc, i, inputSpecificJob.getConfiguration());
            loadFunc.setLocation(inputs.get(i).getFileName(), inputSpecificJob);
            // The above setLocation call could write to the conf within
            // the inputSpecificJob - use this updated conf

            // get the InputFormat from it and ask for splits
            InputFormat inpFormat = loadFunc.getInputFormat();
            // List<InputSplit> oneInputSplits = inpFormat.getSplits(
            // HadoopShims.createJobContext(inputSpecificJob.getConfiguration(),
            // jobcontext.getJobID()));

            List<InputSplit> oneInputSplits = getSplitsSample(jobcontext);

            List<InputSplit> oneInputPigSplits = getPigSplits(oneInputSplits, i, inpTargets.get(i),
                    HadoopShims.getDefaultBlockSize(fs, isFsPath ? path : fs.getWorkingDirectory()), combinable,
                    confClone);
            splits.addAll(oneInputPigSplits);
        } catch (ExecException ee) {
            throw ee;
        } catch (Exception e) {
            int errCode = 2118;
            String msg = "Unable to create input splits for: " + inputs.get(i).getFileName();
            if (e.getMessage() != null && (!e.getMessage().isEmpty())) {
                throw new ExecException(e.getMessage(), errCode, PigException.BUG, e);
            } else {
                throw new ExecException(msg, errCode, PigException.BUG, e);
            }
        }
    }

    // XXX hadoop 20 new API integration: get around a hadoop 20 bug by
    // passing total # of splits to each split so that it can be retrieved
    // in the RecordReader method when called by mapreduce framework later.
    int n = splits.size();
    // also passing the multi-input flag to the back-end so that
    // the multi-input record counters can be created
    int m = inputs.size();

    boolean disableCounter = conf.getBoolean("pig.disable.counter", false);
    if ((m > 1) && disableCounter) {
        log.info("Disable Pig custom input counters");
    }

    for (InputSplit split : splits) {
        ((PigSplit) split).setTotalSplits(n);
        if (m > 1)
            ((PigSplit) split).setMultiInputs(true);
        ((PigSplit) split).setDisableCounter(disableCounter);
    }
    // shuffle --> return splits
    return splits;
}

From source file: org.apache.taverna.platform.execution.impl.hadoop.CrossProductInputFormat.java

License: Apache License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {

    // Generate splits. Split is a list of directories where each directory 
    // contains inputs for one input port of the Taverna processor/activity we 
    // are invoking. 
    // We will have only one split for cross product that will know about all
    // the files in all input directories and will generate RecordReaders 
    // for every combination of files inside these directories.
    //       CrossProductInputSplit split = new CrossProductInputSplit();

    // List the input port directories contained in the input directory passed 
    // in from the command line.
    List<FileStatus> inputPortDirectories = listStatus(job);

    final FileSystem fs = job.getWorkingDirectory().getFileSystem(job.getConfiguration());
    Path workingDirectory = job.getWorkingDirectory();
    System.out.println("Working directory: " + workingDirectory);
    System.out.println("Adding directories to the cross product split:");
    ArrayList<Path> inputPortDirectoriesPaths = new ArrayList<Path>();
    for (FileStatus inputPortDirectory : inputPortDirectories) {
        // TODO input port directories need to be ordered in the order of the 
        // input ports of the Taverna processor/activity they are going into

        //inputPortDirectoriesPaths.add(new Text(inputPortDirectory.getPath().toString()));
        inputPortDirectoriesPaths.add(inputPortDirectory.getPath());
        System.out.println(inputPortDirectory.getPath());

    }
    CrossProductInputSplit split = new CrossProductInputSplit(workingDirectory, inputPortDirectoriesPaths);

    List<InputSplit> splits = new ArrayList<InputSplit>();
    splits.add(split);

    return splits;
}
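
Usage note: the Taverna example above uses the working directory twice. The returned Path supplies the default FileSystem via Path.getFileSystem(Configuration), and the same Path is handed to the CrossProductInputSplit, presumably so that the record readers created from that split can resolve the input port directories against a common base directory. The two Pig examples instead push the working directory onto the FileSystem with setWorkingDirectory(), and only when the job is not running in local mode, so that relative input paths resolve under /user/<userid> on the distributed file system.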