List of usage examples for org.apache.hadoop.mapreduce.JobContext.getWorkingDirectory()
public Path getWorkingDirectory() throws IOException;
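For orientation, here is a minimal sketch of a custom InputFormat that calls getWorkingDirectory() while computing splits; the class name WorkingDirectoryAwareInputFormat and its body are illustrative assumptions, not drawn from the examples below.
import java.io.IOException;
import java.util.Collections;
import java.util.List;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

// Hypothetical example class, not part of Hadoop or of the sources listed below.
public class WorkingDirectoryAwareInputFormat extends InputFormat<Object, Object> {

    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException {
        // The job's working directory, typically /user/<userid> on HDFS.
        Path workingDir = context.getWorkingDirectory();
        FileSystem fs = workingDir.getFileSystem(context.getConfiguration());
        // Align the FileSystem's working directory with the job's so that
        // relative input paths resolve consistently (the Pig examples below do the same).
        fs.setWorkingDirectory(workingDir);
        // A real implementation would list input files here and build splits from them.
        return Collections.<InputSplit>emptyList();
    }

    @Override
    public RecordReader<Object, Object> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        throw new UnsupportedOperationException("Sketch only; no records to read.");
    }
}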
From source file:org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigInputFormat.java
License:Apache License
@SuppressWarnings({ "unchecked", "rawtypes" })
@Override
public List<InputSplit> getSplits(JobContext jobcontext) throws IOException, InterruptedException {
Configuration conf = jobcontext.getConfiguration();
ArrayList<FileSpec> inputs;
ArrayList<ArrayList<OperatorKey>> inpTargets;
PigContext pigContext;
try {
inputs = (ArrayList<FileSpec>) ObjectSerializer.deserialize(conf.get("pig.inputs"));
inpTargets = (ArrayList<ArrayList<OperatorKey>>) ObjectSerializer
.deserialize(conf.get("pig.inpTargets"));
pigContext = (PigContext) ObjectSerializer.deserialize(conf.get("pig.pigContext"));
PigContext.setPackageImportList(
(ArrayList<String>) ObjectSerializer.deserialize(conf.get("udf.import.list")));
MapRedUtil.setupUDFContext(conf);
} catch (Exception e) {
int errCode = 2094;
String msg = "Unable to deserialize object.";
throw new ExecException(msg, errCode, PigException.BUG, e);
}
ArrayList<InputSplit> splits = new ArrayList<InputSplit>();
for (int i = 0; i < inputs.size(); i++) {
try {
Path path = new Path(inputs.get(i).getFileName());
FileSystem fs;
boolean isFsPath = true;
try {
fs = path.getFileSystem(conf);
} catch (Exception e) {
// If an application-specific scheme was used
// (e.g. "hbase://table"), getting the file system
// will fail. That's ok; we just use the dfs in that case.
fs = new Path("/").getFileSystem(conf);
isFsPath = false;
}
// if the execution is against Mapred DFS, set
// working dir to /user/<userid>
if (!Utils.isLocal(pigContext, conf)) {
fs.setWorkingDirectory(jobcontext.getWorkingDirectory());
}
// First pass the input location to the loader - for this, send a
// clone of the configuration we have. This is so that if the
// loader (or the loader's InputFormat) decides to store the
// input location in the configuration (for example,
// FileInputFormat stores this in mapred.input.dir in the conf),
// the loaders for different inputs don't end up
// over-writing the same conf.
FuncSpec loadFuncSpec = inputs.get(i).getFuncSpec();
LoadFunc loadFunc = (LoadFunc) PigContext.instantiateFuncFromSpec(loadFuncSpec);
boolean combinable = !(loadFunc instanceof MergeJoinIndexer || loadFunc instanceof IndexableLoadFunc
|| (loadFunc instanceof CollectableLoadFunc && loadFunc instanceof OrderedLoadFunc));
if (combinable)
combinable = !conf.getBoolean("pig.noSplitCombination", false);
JobConf confClone = new JobConf(conf);
Job inputSpecificJob = new Job(confClone);
// Pass loader signature to LoadFunc and to InputFormat through
// the conf
passLoadSignature(loadFunc, i, inputSpecificJob.getConfiguration());
loadFunc.setLocation(inputs.get(i).getFileName(), inputSpecificJob);
// The above setLocation call could write to the conf within
// the inputSpecificJob - use this updated conf to
// get the InputFormat from it and ask for splits
InputFormat inpFormat = loadFunc.getInputFormat();
List<InputSplit> oneInputSplits = inpFormat.getSplits(
HadoopShims.createJobContext(inputSpecificJob.getConfiguration(), jobcontext.getJobID()));
List<InputSplit> oneInputPigSplits = getPigSplits(oneInputSplits, i, inpTargets.get(i),
HadoopShims.getDefaultBlockSize(fs, isFsPath ? path : fs.getWorkingDirectory()), combinable,
confClone);
splits.addAll(oneInputPigSplits);
} catch (ExecException ee) {
throw ee;
} catch (Exception e) {
int errCode = 2118;
String msg = "Unable to create input splits for: " + inputs.get(i).getFileName();
if (e.getMessage() != null && (!e.getMessage().isEmpty())) {
throw new ExecException(e.getMessage(), errCode, PigException.BUG, e);
} else {
throw new ExecException(msg, errCode, PigException.BUG, e);
}
}
}
// XXX hadoop 20 new API integration: get around a hadoop 20 bug by
// passing total # of splits to each split so that it can be retrieved
// in the RecordReader method when called by mapreduce framework later.
int n = splits.size();
// also passing the multi-input flag to the back-end so that
// the multi-input record counters can be created
int m = inputs.size();
boolean disableCounter = conf.getBoolean("pig.disable.counter", false);
if ((m > 1) && disableCounter) {
log.info("Disable Pig custom input counters");
}
for (InputSplit split : splits) {
((PigSplit) split).setTotalSplits(n);
if (m > 1)
((PigSplit) split).setMultiInputs(true);
((PigSplit) split).setDisableCounter(disableCounter);
}
return splits;
}
From source file:org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigInputSplitFormat.java
License:Apache License
@SuppressWarnings({ "unchecked", "rawtypes" })
@Override
public List<InputSplit> getSplits(JobContext jobcontext) throws IOException {
Configuration conf = jobcontext.getConfiguration();
ArrayList<FileSpec> inputs;
ArrayList<ArrayList<OperatorKey>> inpTargets;
PigContext pigContext;
try {
inputs = (ArrayList<FileSpec>) ObjectSerializer.deserialize(conf.get("pig.inputs"));
inpTargets = (ArrayList<ArrayList<OperatorKey>>) ObjectSerializer
.deserialize(conf.get("pig.inpTargets"));
pigContext = (PigContext) ObjectSerializer.deserialize(conf.get("pig.pigContext"));
PigContext.setPackageImportList(
(ArrayList<String>) ObjectSerializer.deserialize(conf.get("udf.import.list")));
MapRedUtil.setupUDFContext(conf);
} catch (Exception e) {
int errCode = 2094;
String msg = "Unable to deserialize object.";
throw new ExecException(msg, errCode, PigException.BUG, e);
}
ArrayList<InputSplit> splits = new ArrayList<InputSplit>();
for (int i = 0; i < inputs.size(); i++) {
try {
Path path = new Path(inputs.get(i).getFileName());
FileSystem fs;
boolean isFsPath = true;
try {
fs = path.getFileSystem(conf);
} catch (Exception e) {
// If an application-specific scheme was used
// (e.g. "hbase://table"), getting the file system
// will fail. That's ok; we just use the dfs in that case.
fs = new Path("/").getFileSystem(conf);
isFsPath = false;
}
// if the execution is against Mapred DFS, set
// working dir to /user/<userid>
if (!Utils.isLocal(pigContext, conf)) {
fs.setWorkingDirectory(jobcontext.getWorkingDirectory());
}
// First pass the input location to the loader - for this, send a
// clone of the configuration we have. This is so that if the
// loader (or the loader's InputFormat) decides to store the
// input location in the configuration (for example,
// FileInputFormat stores this in mapred.input.dir in the conf),
// the loaders for different inputs don't end up
// over-writing the same conf.
FuncSpec loadFuncSpec = inputs.get(i).getFuncSpec();
LoadFunc loadFunc = (LoadFunc) PigContext.instantiateFuncFromSpec(loadFuncSpec);
boolean combinable = !(loadFunc instanceof MergeJoinIndexer || loadFunc instanceof IndexableLoadFunc
|| (loadFunc instanceof CollectableLoadFunc && loadFunc instanceof OrderedLoadFunc));
if (combinable)
combinable = !conf.getBoolean("pig.noSplitCombination", false);
Configuration confClone = new Configuration(conf);
Job inputSpecificJob = new Job(confClone);
// Pass loader signature to LoadFunc and to InputFormat through
// the conf
passLoadSignature(loadFunc, i, inputSpecificJob.getConfiguration());
loadFunc.setLocation(inputs.get(i).getFileName(), inputSpecificJob);
// The above setLocation call could write to the conf within
// the inputSpecificJob - use this updated conf to
// get the InputFormat from it and ask for splits
InputFormat inpFormat = loadFunc.getInputFormat();
// List<InputSplit> oneInputSplits = inpFormat.getSplits(
// HadoopShims.createJobContext(inputSpecificJob.getConfiguration(),
// jobcontext.getJobID()));
List<InputSplit> oneInputSplits = getSplitsSample(jobcontext);
List<InputSplit> oneInputPigSplits = getPigSplits(oneInputSplits, i, inpTargets.get(i),
HadoopShims.getDefaultBlockSize(fs, isFsPath ? path : fs.getWorkingDirectory()), combinable,
confClone);
splits.addAll(oneInputPigSplits);
} catch (ExecException ee) {
throw ee;
} catch (Exception e) {
int errCode = 2118;
String msg = "Unable to create input splits for: " + inputs.get(i).getFileName();
if (e.getMessage() != null && (!e.getMessage().isEmpty())) {
throw new ExecException(e.getMessage(), errCode, PigException.BUG, e);
} else {
throw new ExecException(msg, errCode, PigException.BUG, e);
}
}
}
// XXX hadoop 20 new API integration: get around a hadoop 20 bug by
// passing total # of splits to each split so that it can be retrieved
// in the RecordReader method when called by mapreduce framework later.
int n = splits.size();
// also passing the multi-input flag to the back-end so that
// the multi-input record counters can be created
int m = inputs.size();
boolean disableCounter = conf.getBoolean("pig.disable.counter", false);
if ((m > 1) && disableCounter) {
log.info("Disable Pig custom input counters");
}
for (InputSplit split : splits) {
((PigSplit) split).setTotalSplits(n);
if (m > 1)
((PigSplit) split).setMultiInputs(true);
((PigSplit) split).setDisableCounter(disableCounter);
}
// shuffle --> return splits
return splits;
}
From source file:org.apache.taverna.platform.execution.impl.hadoop.CrossProductInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
// Generate splits. Split is a list of directories where each directory
// contains inputs for one input port of the Taverna processor/activity we
// are invoking.
// We will have only one split for cross product that will know about all
// the files in all input directories and will generate RecordReaders
// for every combination of files inside these directories.
// CrossProductInputSplit split = new CrossProductInputSplit();
// List the input port directories contained in the input directory passed
// in from the command line.
List<FileStatus> inputPortDirectories = listStatus(job);
final FileSystem fs = job.getWorkingDirectory().getFileSystem(job.getConfiguration());
Path workingDirectory = job.getWorkingDirectory();
System.out.println("Working directory: " + workingDirectory);
System.out.println("Adding directories to the cross product split:");
ArrayList<Path> inputPortDirectoriesPaths = new ArrayList<Path>();
for (FileStatus inputPortDirectory : inputPortDirectories) {
// TODO input port directories need to be ordered in the order of the
// input ports of the Taverna processor/activity they are going into
//inputPortDirectoriesPaths.add(new Text(inputPortDirectory.getPath().toString()));
inputPortDirectoriesPaths.add(inputPortDirectory.getPath());
System.out.println(inputPortDirectory.getPath());
}
CrossProductInputSplit split = new CrossProductInputSplit(workingDirectory, inputPortDirectoriesPaths);
List<InputSplit> splits = new ArrayList<InputSplit>();
splits.add(split);
return splits;
}