List of usage examples for org.apache.hadoop.mapreduce.JobContext.getWorkingDirectory()
public Path getWorkingDirectory() throws IOException;
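The examples below all read the working directory inside InputFormat.getSplits(). As a minimal, hypothetical sketch (the class name, the empty split list, and the comments are illustrative only and not taken from the sources below), a custom InputFormat might use the method like this:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

// Hypothetical InputFormat, used only to show where getWorkingDirectory() is typically read.
public class WorkingDirAwareInputFormat extends InputFormat<Object, Object> {

    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
        // The job's working directory is the base against which relative paths resolve.
        Path workingDir = context.getWorkingDirectory();
        FileSystem fs = workingDir.getFileSystem(context.getConfiguration());
        // Align the FileSystem's working directory with the job's, as the Pig examples below do.
        fs.setWorkingDirectory(workingDir);
        return new ArrayList<InputSplit>(); // a real implementation would build splits here
    }

    @Override
    public RecordReader<Object, Object> createRecordReader(InputSplit split, TaskAttemptContext context) {
        throw new UnsupportedOperationException("sketch only - no records to read");
    }
}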
From source file:org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigInputFormat.java
License:Apache License
@SuppressWarnings({ "unchecked", "rawtypes" })
@Override
public List<InputSplit> getSplits(JobContext jobcontext) throws IOException, InterruptedException {
    Configuration conf = jobcontext.getConfiguration();
    ArrayList<FileSpec> inputs;
    ArrayList<ArrayList<OperatorKey>> inpTargets;
    PigContext pigContext;
    try {
        inputs = (ArrayList<FileSpec>) ObjectSerializer.deserialize(conf.get("pig.inputs"));
        inpTargets = (ArrayList<ArrayList<OperatorKey>>) ObjectSerializer
                .deserialize(conf.get("pig.inpTargets"));
        pigContext = (PigContext) ObjectSerializer.deserialize(conf.get("pig.pigContext"));
        PigContext.setPackageImportList(
                (ArrayList<String>) ObjectSerializer.deserialize(conf.get("udf.import.list")));
        MapRedUtil.setupUDFContext(conf);
    } catch (Exception e) {
        int errCode = 2094;
        String msg = "Unable to deserialize object.";
        throw new ExecException(msg, errCode, PigException.BUG, e);
    }
    ArrayList<InputSplit> splits = new ArrayList<InputSplit>();
    for (int i = 0; i < inputs.size(); i++) {
        try {
            Path path = new Path(inputs.get(i).getFileName());
            FileSystem fs;
            boolean isFsPath = true;
            try {
                fs = path.getFileSystem(conf);
            } catch (Exception e) {
                // If an application specific scheme was used (e.g.: "hbase://table")
                // we will fail getting the file system. That's ok, we just use the
                // dfs in that case.
                fs = new Path("/").getFileSystem(conf);
                isFsPath = false;
            }
            // if the execution is against Mapred DFS, set
            // working dir to /user/<userid>
            if (!Utils.isLocal(pigContext, conf)) {
                fs.setWorkingDirectory(jobcontext.getWorkingDirectory());
            }
            // first pass input location to the loader - for this send a
            // clone of the configuration we have - this is so that if the
            // loader (or the inputformat of the loader) decide to store the
            // input location into the configuration (for example,
            // FileInputFormat stores this in mapred.input.dir in the conf),
            // then for different inputs, the loader's don't end up
            // over-writing the same conf.
            FuncSpec loadFuncSpec = inputs.get(i).getFuncSpec();
            LoadFunc loadFunc = (LoadFunc) PigContext.instantiateFuncFromSpec(loadFuncSpec);
            boolean combinable = !(loadFunc instanceof MergeJoinIndexer
                    || loadFunc instanceof IndexableLoadFunc
                    || (loadFunc instanceof CollectableLoadFunc && loadFunc instanceof OrderedLoadFunc));
            if (combinable)
                combinable = !conf.getBoolean("pig.noSplitCombination", false);
            JobConf confClone = new JobConf(conf);
            Job inputSpecificJob = new Job(confClone);
            // Pass loader signature to LoadFunc and to InputFormat through the conf
            passLoadSignature(loadFunc, i, inputSpecificJob.getConfiguration());
            loadFunc.setLocation(inputs.get(i).getFileName(), inputSpecificJob);
            // The above setLocation call could write to the conf within the
            // inputSpecificJob - use this updated conf, get the InputFormat
            // from it and ask for splits
            InputFormat inpFormat = loadFunc.getInputFormat();
            List<InputSplit> oneInputSplits = inpFormat.getSplits(HadoopShims
                    .createJobContext(inputSpecificJob.getConfiguration(), jobcontext.getJobID()));
            List<InputSplit> oneInputPigSplits = getPigSplits(oneInputSplits, i, inpTargets.get(i),
                    HadoopShims.getDefaultBlockSize(fs, isFsPath ? path : fs.getWorkingDirectory()),
                    combinable, confClone);
            splits.addAll(oneInputPigSplits);
        } catch (ExecException ee) {
            throw ee;
        } catch (Exception e) {
            int errCode = 2118;
            String msg = "Unable to create input splits for: " + inputs.get(i).getFileName();
            if (e.getMessage() != null && (!e.getMessage().isEmpty())) {
                throw new ExecException(e.getMessage(), errCode, PigException.BUG, e);
            } else {
                throw new ExecException(msg, errCode, PigException.BUG, e);
            }
        }
    }
    // XXX hadoop 20 new API integration: get around a hadoop 20 bug by
    // passing total # of splits to each split so that it can be retrieved
    // in the RecordReader method when called by mapreduce framework later.
    int n = splits.size();
    // also passing the multi-input flag to the back-end so that
    // the multi-input record counters can be created
    int m = inputs.size();
    boolean disableCounter = conf.getBoolean("pig.disable.counter", false);
    if ((m > 1) && disableCounter) {
        log.info("Disable Pig custom input counters");
    }
    for (InputSplit split : splits) {
        ((PigSplit) split).setTotalSplits(n);
        if (m > 1)
            ((PigSplit) split).setMultiInputs(true);
        ((PigSplit) split).setDisableCounter(disableCounter);
    }
    return splits;
}
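In the example above, aligning the FileSystem's working directory with the one reported by the JobContext means that relative input paths resolve under the job's working directory (typically /user/<userid> on HDFS) when the job is not running locally. A small, hypothetical illustration of that effect, reusing the jobcontext variable from the method above (the path is made up):

Configuration conf = jobcontext.getConfiguration();
FileSystem fs = FileSystem.get(conf);
fs.setWorkingDirectory(jobcontext.getWorkingDirectory());

// A relative path (no scheme, no leading slash) is now qualified against the job's
// working directory, e.g. hdfs://namenode:8020/user/<userid>/input/part-00000
Path qualified = fs.makeQualified(new Path("input/part-00000"));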
From source file:org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigInputSplitFormat.java
License:Apache License
@SuppressWarnings({ "unchecked", "rawtypes" })
@Override
public List<InputSplit> getSplits(JobContext jobcontext) throws IOException {
    Configuration conf = jobcontext.getConfiguration();
    ArrayList<FileSpec> inputs;
    ArrayList<ArrayList<OperatorKey>> inpTargets;
    PigContext pigContext;
    try {
        inputs = (ArrayList<FileSpec>) ObjectSerializer.deserialize(conf.get("pig.inputs"));
        inpTargets = (ArrayList<ArrayList<OperatorKey>>) ObjectSerializer
                .deserialize(conf.get("pig.inpTargets"));
        pigContext = (PigContext) ObjectSerializer.deserialize(conf.get("pig.pigContext"));
        PigContext.setPackageImportList(
                (ArrayList<String>) ObjectSerializer.deserialize(conf.get("udf.import.list")));
        MapRedUtil.setupUDFContext(conf);
    } catch (Exception e) {
        int errCode = 2094;
        String msg = "Unable to deserialize object.";
        throw new ExecException(msg, errCode, PigException.BUG, e);
    }
    ArrayList<InputSplit> splits = new ArrayList<InputSplit>();
    for (int i = 0; i < inputs.size(); i++) {
        try {
            Path path = new Path(inputs.get(i).getFileName());
            FileSystem fs;
            boolean isFsPath = true;
            try {
                fs = path.getFileSystem(conf);
            } catch (Exception e) {
                // If an application specific scheme was used (e.g.: "hbase://table")
                // we will fail getting the file system. That's ok, we just use the
                // dfs in that case.
                fs = new Path("/").getFileSystem(conf);
                isFsPath = false;
            }
            // if the execution is against Mapred DFS, set
            // working dir to /user/<userid>
            if (!Utils.isLocal(pigContext, conf)) {
                fs.setWorkingDirectory(jobcontext.getWorkingDirectory());
            }
            // first pass input location to the loader - for this send a
            // clone of the configuration we have - this is so that if the
            // loader (or the inputformat of the loader) decide to store the
            // input location into the configuration (for example,
            // FileInputFormat stores this in mapred.input.dir in the conf),
            // then for different inputs, the loader's don't end up
            // over-writing the same conf.
            FuncSpec loadFuncSpec = inputs.get(i).getFuncSpec();
            LoadFunc loadFunc = (LoadFunc) PigContext.instantiateFuncFromSpec(loadFuncSpec);
            boolean combinable = !(loadFunc instanceof MergeJoinIndexer
                    || loadFunc instanceof IndexableLoadFunc
                    || (loadFunc instanceof CollectableLoadFunc && loadFunc instanceof OrderedLoadFunc));
            if (combinable)
                combinable = !conf.getBoolean("pig.noSplitCombination", false);
            Configuration confClone = new Configuration(conf);
            Job inputSpecificJob = new Job(confClone);
            // Pass loader signature to LoadFunc and to InputFormat through the conf
            passLoadSignature(loadFunc, i, inputSpecificJob.getConfiguration());
            loadFunc.setLocation(inputs.get(i).getFileName(), inputSpecificJob);
            // The above setLocation call could write to the conf within the
            // inputSpecificJob - use this updated conf, get the InputFormat
            // from it and ask for splits
            InputFormat inpFormat = loadFunc.getInputFormat();
            // List<InputSplit> oneInputSplits = inpFormat.getSplits(
            //         HadoopShims.createJobContext(inputSpecificJob.getConfiguration(),
            //                 jobcontext.getJobID()));
            List<InputSplit> oneInputSplits = getSplitsSample(jobcontext);
            List<InputSplit> oneInputPigSplits = getPigSplits(oneInputSplits, i, inpTargets.get(i),
                    HadoopShims.getDefaultBlockSize(fs, isFsPath ? path : fs.getWorkingDirectory()),
                    combinable, confClone);
            splits.addAll(oneInputPigSplits);
        } catch (ExecException ee) {
            throw ee;
        } catch (Exception e) {
            int errCode = 2118;
            String msg = "Unable to create input splits for: " + inputs.get(i).getFileName();
            if (e.getMessage() != null && (!e.getMessage().isEmpty())) {
                throw new ExecException(e.getMessage(), errCode, PigException.BUG, e);
            } else {
                throw new ExecException(msg, errCode, PigException.BUG, e);
            }
        }
    }
    // XXX hadoop 20 new API integration: get around a hadoop 20 bug by
    // passing total # of splits to each split so that it can be retrieved
    // in the RecordReader method when called by mapreduce framework later.
    int n = splits.size();
    // also passing the multi-input flag to the back-end so that
    // the multi-input record counters can be created
    int m = inputs.size();
    boolean disableCounter = conf.getBoolean("pig.disable.counter", false);
    if ((m > 1) && disableCounter) {
        log.info("Disable Pig custom input counters");
    }
    for (InputSplit split : splits) {
        ((PigSplit) split).setTotalSplits(n);
        if (m > 1)
            ((PigSplit) split).setMultiInputs(true);
        ((PigSplit) split).setDisableCounter(disableCounter);
    }
    // shuffle --> return splits
    return splits;
}
From source file:org.apache.taverna.platform.execution.impl.hadoop.CrossProductInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    // Generate splits. Split is a list of directories where each directory
    // contains inputs for one input port of the Taverna processor/activity we
    // are invoking. We will have only one split for cross product that will
    // know about all the files in all input directories and will generate
    // RecordReaders for every combination of files inside these directories.
    // CrossProductInputSplit split = new CrossProductInputSplit();

    // List the input port directories contained in the input directory passed
    // in from the command line.
    List<FileStatus> inputPortDirectories = listStatus(job);

    final FileSystem fs = job.getWorkingDirectory().getFileSystem(job.getConfiguration());
    Path workingDirectory = job.getWorkingDirectory();
    System.out.println("Working directory: " + workingDirectory);
    System.out.println("Adding directories to the cross product split:");

    ArrayList<Path> inputPortDirectoriesPaths = new ArrayList<Path>();
    for (FileStatus inputPortDirectory : inputPortDirectories) {
        // TODO input port directories need to be ordered in the order of the
        // input ports of the Taverna processor/activity they are going into
        //inputPortDirectoriesPaths.add(new Text(inputPortDirectory.getPath().toString()));
        inputPortDirectoriesPaths.add(inputPortDirectory.getPath());
        System.out.println(inputPortDirectory.getPath());
    }

    CrossProductInputSplit split = new CrossProductInputSplit(workingDirectory, inputPortDirectoriesPaths);

    List<InputSplit> splits = new ArrayList<InputSplit>();
    splits.add(split);
    return splits;
}