List of usage examples for org.apache.hadoop.mapreduce.JobContext.getWorkingDirectory()
public Path getWorkingDirectory() throws IOException;
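For orientation, here is a minimal sketch of a custom InputFormat that calls getWorkingDirectory() while computing splits; the class name WorkingDirectoryAwareInputFormat and its body are illustrative assumptions, not drawn from the examples below.
import java.io.IOException;
import java.util.Collections;
import java.util.List;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

// Hypothetical example class, not part of Hadoop or of the sources listed below.
public class WorkingDirectoryAwareInputFormat extends InputFormat<Object, Object> {

    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException {
        // The job's working directory, typically /user/<userid> on HDFS.
        Path workingDir = context.getWorkingDirectory();
        FileSystem fs = workingDir.getFileSystem(context.getConfiguration());
        // Align the FileSystem's working directory with the job's so that
        // relative input paths resolve consistently (the Pig examples below do the same).
        fs.setWorkingDirectory(workingDir);
        // A real implementation would list input files here and build splits from them.
        return Collections.<InputSplit>emptyList();
    }

    @Override
    public RecordReader<Object, Object> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        throw new UnsupportedOperationException("Sketch only; no records to read.");
    }
}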
From source file:org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigInputFormat.java
License:Apache License
@SuppressWarnings({ "unchecked", "rawtypes" })
@Override
public List<InputSplit> getSplits(JobContext jobcontext) throws IOException, InterruptedException {
Configuration conf = jobcontext.getConfiguration();
ArrayList<FileSpec> inputs;
ArrayList<ArrayList<OperatorKey>> inpTargets;
PigContext pigContext;
try {
inputs = (ArrayList<FileSpec>) ObjectSerializer.deserialize(conf.get("pig.inputs"));
inpTargets = (ArrayList<ArrayList<OperatorKey>>) ObjectSerializer
.deserialize(conf.get("pig.inpTargets"));
pigContext = (PigContext) ObjectSerializer.deserialize(conf.get("pig.pigContext"));
PigContext.setPackageImportList(
(ArrayList<String>) ObjectSerializer.deserialize(conf.get("udf.import.list")));
MapRedUtil.setupUDFContext(conf);
} catch (Exception e) {
int errCode = 2094;
String msg = "Unable to deserialize object.";
throw new ExecException(msg, errCode, PigException.BUG, e);
}
ArrayList<InputSplit> splits = new ArrayList<InputSplit>();
for (int i = 0; i < inputs.size(); i++) {
try {
Path path = new Path(inputs.get(i).getFileName());
FileSystem fs;
boolean isFsPath = true;
try {
fs = path.getFileSystem(conf);
} catch (Exception e) {
// If an application-specific scheme was used
// (e.g. "hbase://table"), getting the file system
// will fail. That's ok; we just use the dfs in that case.
fs = new Path("/").getFileSystem(conf);
isFsPath = false;
}
// if the execution is against Mapred DFS, set
// working dir to /user/<userid>
if (!Utils.isLocal(pigContext, conf)) {
fs.setWorkingDirectory(jobcontext.getWorkingDirectory());
}
// First pass the input location to the loader - for this, send a
// clone of the configuration we have. This is so that if the
// loader (or the loader's InputFormat) decides to store the
// input location in the configuration (for example,
// FileInputFormat stores this in mapred.input.dir in the conf),
// the loaders for different inputs don't end up
// over-writing the same conf.
FuncSpec loadFuncSpec = inputs.get(i).getFuncSpec();
LoadFunc loadFunc = (LoadFunc) PigContext.instantiateFuncFromSpec(loadFuncSpec);
boolean combinable = !(loadFunc instanceof MergeJoinIndexer || loadFunc instanceof IndexableLoadFunc
|| (loadFunc instanceof CollectableLoadFunc && loadFunc instanceof OrderedLoadFunc));
if (combinable)
combinable = !conf.getBoolean("pig.noSplitCombination", false);
JobConf confClone = new JobConf(conf);
Job inputSpecificJob = new Job(confClone);
// Pass loader signature to LoadFunc and to InputFormat through
// the conf
passLoadSignature(loadFunc, i, inputSpecificJob.getConfiguration());
loadFunc.setLocation(inputs.get(i).getFileName(), inputSpecificJob);
// The above setLocation call could write to the conf within
// the inputSpecificJob - use this updated conf to
// get the InputFormat from it and ask for splits
InputFormat inpFormat = loadFunc.getInputFormat();
List<InputSplit> oneInputSplits = inpFormat.getSplits(
HadoopShims.createJobContext(inputSpecificJob.getConfiguration(), jobcontext.getJobID()));
List<InputSplit> oneInputPigSplits = getPigSplits(oneInputSplits, i, inpTargets.get(i),
HadoopShims.getDefaultBlockSize(fs, isFsPath ? path : fs.getWorkingDirectory()), combinable,
confClone);
splits.addAll(oneInputPigSplits);
} catch (ExecException ee) {
throw ee;
} catch (Exception e) {
int errCode = 2118;
String msg = "Unable to create input splits for: " + inputs.get(i).getFileName();
if (e.getMessage() != null && (!e.getMessage().isEmpty())) {
throw new ExecException(e.getMessage(), errCode, PigException.BUG, e);
} else {
throw new ExecException(msg, errCode, PigException.BUG, e);
}
}
}
// XXX hadoop 20 new API integration: get around a hadoop 20 bug by
// passing total # of splits to each split so that it can be retrieved
// in the RecordReader method when called by mapreduce framework later.
int n = splits.size();
// also passing the multi-input flag to the back-end so that
// the multi-input record counters can be created
int m = inputs.size();
boolean disableCounter = conf.getBoolean("pig.disable.counter", false);
if ((m > 1) && disableCounter) {
log.info("Disable Pig custom input counters");
}
for (InputSplit split : splits) {
((PigSplit) split).setTotalSplits(n);
if (m > 1)
((PigSplit) split).setMultiInputs(true);
((PigSplit) split).setDisableCounter(disableCounter);
}
return splits;
}
From source file:org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigInputSplitFormat.java
License:Apache License
@SuppressWarnings({ "unchecked", "rawtypes" })
@Override
public List<InputSplit> getSplits(JobContext jobcontext) throws IOException {
Configuration conf = jobcontext.getConfiguration();
ArrayList<FileSpec> inputs;
ArrayList<ArrayList<OperatorKey>> inpTargets;
PigContext pigContext;
try {
inputs = (ArrayList<FileSpec>) ObjectSerializer.deserialize(conf.get("pig.inputs"));
inpTargets = (ArrayList<ArrayList<OperatorKey>>) ObjectSerializer
.deserialize(conf.get("pig.inpTargets"));
pigContext = (PigContext) ObjectSerializer.deserialize(conf.get("pig.pigContext"));
PigContext.setPackageImportList(
(ArrayList<String>) ObjectSerializer.deserialize(conf.get("udf.import.list")));
MapRedUtil.setupUDFContext(conf);
} catch (Exception e) {
int errCode = 2094;
String msg = "Unable to deserialize object.";
throw new ExecException(msg, errCode, PigException.BUG, e);
}
ArrayList<InputSplit> splits = new ArrayList<InputSplit>();
for (int i = 0; i < inputs.size(); i++) {
try {
Path path = new Path(inputs.get(i).getFileName());
FileSystem fs;
boolean isFsPath = true;
try {
fs = path.getFileSystem(conf);
} catch (Exception e) {
// If an application-specific scheme was used
// (e.g. "hbase://table"), getting the file system
// will fail. That's ok; we just use the dfs in that case.
fs = new Path("/").getFileSystem(conf);
isFsPath = false;
}
// if the execution is against Mapred DFS, set
// working dir to /user/<userid>
if (!Utils.isLocal(pigContext, conf)) {
fs.setWorkingDirectory(jobcontext.getWorkingDirectory());
}
// First pass the input location to the loader - for this, send a
// clone of the configuration we have. This is so that if the
// loader (or the loader's InputFormat) decides to store the
// input location in the configuration (for example,
// FileInputFormat stores this in mapred.input.dir in the conf),
// the loaders for different inputs don't end up
// over-writing the same conf.
FuncSpec loadFuncSpec = inputs.get(i).getFuncSpec();
LoadFunc loadFunc = (LoadFunc) PigContext.instantiateFuncFromSpec(loadFuncSpec);
boolean combinable = !(loadFunc instanceof MergeJoinIndexer || loadFunc instanceof IndexableLoadFunc
|| (loadFunc instanceof CollectableLoadFunc && loadFunc instanceof OrderedLoadFunc));
if (combinable)
combinable = !conf.getBoolean("pig.noSplitCombination", false);
Configuration confClone = new Configuration(conf);
Job inputSpecificJob = new Job(confClone);
// Pass loader signature to LoadFunc and to InputFormat through
// the conf
passLoadSignature(loadFunc, i, inputSpecificJob.getConfiguration());
loadFunc.setLocation(inputs.get(i).getFileName(), inputSpecificJob);
// The above setLocation call could write to the conf within
// the inputSpecificJob - use this updated conf to
// get the InputFormat from it and ask for splits
InputFormat inpFormat = loadFunc.getInputFormat();
// List<InputSplit> oneInputSplits = inpFormat.getSplits(
// HadoopShims.createJobContext(inputSpecificJob.getConfiguration(),
// jobcontext.getJobID()));
List<InputSplit> oneInputSplits = getSplitsSample(jobcontext);
List<InputSplit> oneInputPigSplits = getPigSplits(oneInputSplits, i, inpTargets.get(i),
HadoopShims.getDefaultBlockSize(fs, isFsPath ? path : fs.getWorkingDirectory()), combinable,
confClone);
splits.addAll(oneInputPigSplits);
} catch (ExecException ee) {
throw ee;
} catch (Exception e) {
int errCode = 2118;
String msg = "Unable to create input splits for: " + inputs.get(i).getFileName();
if (e.getMessage() != null && (!e.getMessage().isEmpty())) {
throw new ExecException(e.getMessage(), errCode, PigException.BUG, e);
} else {
throw new ExecException(msg, errCode, PigException.BUG, e);
}
}
}
// XXX hadoop 20 new API integration: get around a hadoop 20 bug by
// passing total # of splits to each split so that it can be retrieved
// in the RecordReader method when called by mapreduce framework later.
int n = splits.size();
// also passing the multi-input flag to the back-end so that
// the multi-input record counters can be created
int m = inputs.size();
boolean disableCounter = conf.getBoolean("pig.disable.counter", false);
if ((m > 1) && disableCounter) {
log.info("Disable Pig custom input counters");
}
for (InputSplit split : splits) {
((PigSplit) split).setTotalSplits(n);
if (m > 1)
((PigSplit) split).setMultiInputs(true);
((PigSplit) split).setDisableCounter(disableCounter);
}
// shuffle --> return splits
return splits;
}
From source file:org.apache.taverna.platform.execution.impl.hadoop.CrossProductInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
// Generate splits. Split is a list of directories where each directory
// contains inputs for one input port of the Taverna processor/activity we
// are invoking.
// We will have only one split for cross product that will know about all
// the files in all input directories and will generate RecordReaders
// for every combination of files inside these directories.
// CrossProductInputSplit split = new CrossProductInputSplit();
// List the input port directories contained in the input directory passed
// in from the command line.
List<FileStatus> inputPortDirectories = listStatus(job);
final FileSystem fs = job.getWorkingDirectory().getFileSystem(job.getConfiguration());
Path workingDirectory = job.getWorkingDirectory();
System.out.println("Working directory: " + workingDirectory);
System.out.println("Adding directories to the cross product split:");
ArrayList<Path> inputPortDirectoriesPaths = new ArrayList<Path>();
for (FileStatus inputPortDirectory : inputPortDirectories) {
// TODO input port directories need to be ordered in the order of the
// input ports of the Taverna processor/activity they are going into
//inputPortDirectoriesPaths.add(new Text(inputPortDirectory.getPath().toString()));
inputPortDirectoriesPaths.add(inputPortDirectory.getPath());
System.out.println(inputPortDirectory.getPath());
}
CrossProductInputSplit split = new CrossProductInputSplit(workingDirectory, inputPortDirectoriesPaths);
List<InputSplit> splits = new ArrayList<InputSplit>();
splits.add(split);
return splits;
}