List of usage examples for org.apache.hadoop.fs.Path#getFileSystem
public FileSystem getFileSystem(Configuration conf) throws IOException
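Every example on this page follows the same basic pattern: given a Path and a Hadoop Configuration, getFileSystem resolves the FileSystem implementation that owns the path (local, HDFS, S3, etc.) from the path's URI scheme, falling back to the configured default filesystem for scheme-less paths. A minimal sketch of that pattern follows; the path name is a hypothetical chosen for illustration, not taken from any example below.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileSystemSketch {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // "/tmp/example.txt" is a hypothetical path, used only for illustration
        Path path = new Path("/tmp/example.txt");
        // Resolve the FileSystem that owns this path; the scheme of the path's
        // URI (file://, hdfs://, s3a://, ...) selects the implementation
        FileSystem fs = path.getFileSystem(conf);
        System.out.println("Owning filesystem: " + fs.getUri());
        System.out.println("Exists: " + fs.exists(path));
    }
}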
From source file:com.datasalt.pangool.flow.BaseFlow.java
License:Apache License
public void execute(final EXECUTION_MODE mode, final Configuration conf, String... outputs) throws Exception {
    List<Step> toResolve = new ArrayList<Step>();
    for (String output : outputs) {
        Step orig = findInOutputs(output);
        if (orig == null) {
            throw new IllegalArgumentException("Unknown output: " + output + " not found in flow context.");
        }
        toResolve.add(orig);
    }
    Step orig;
    final Map<String, Step> jobOutputBindings = new HashMap<String, Step>();
    final Map<Step, Set<Step>> stepDependencies = new HashMap<Step, Set<Step>>();
    while (toResolve.size() > 0) {
        Iterator<Step> it = toResolve.iterator();
        orig = it.next();
        it.remove();
        Log.info("Resolving dependencies for " + orig.getName());
        Set<Step> deps = new HashSet<Step>();
        for (Input input : orig.getInputs()) {
            String inputName = orig.getName() + "." + input.name;
            String bindedTo = bindings.get(inputName);
            if (bindedTo == null) {
                throw new IllegalArgumentException(
                        "Input " + inputName + " not binded to anything in current flow context.");
            }
            Step job = findInOutputs(bindedTo);
            if (job == null) {
                if (!inputs.contains(bindedTo)) {
                    throw new IllegalArgumentException("Unknown input: " + bindedTo + " binded to " + inputName
                            + " not found in flow context.");
                }
            } else {
                deps.add(job);
                jobOutputBindings.put(inputName, job);
                toResolve.add(job);
            }
        }
        stepDependencies.put(orig, deps);
    }
    Log.info("Steps to execute and dependencies: " + stepDependencies);
    Set<Step> completedSteps = new HashSet<Step>();
    ExecutorService executor = Executors.newCachedThreadPool();
    Set<Future<Step>> stepsBeingExecuted = new HashSet<Future<Step>>();
    final AtomicBoolean flowFailed = new AtomicBoolean(false);
    while (stepDependencies.keySet().size() > 0) {
        // gather all steps at this level: steps to be executed at each moment are
        // steps whose dependencies have all already been executed
        Set<Step> stepsToExecuteInParallel = new HashSet<Step>();
        for (Map.Entry<Step, Set<Step>> entry : stepDependencies.entrySet()) {
            boolean canBeExecuted = true;
            for (Step dependencyStep : entry.getValue()) {
                if (!completedSteps.contains(dependencyStep)) {
                    canBeExecuted = false;
                    break;
                }
            }
            if (canBeExecuted) {
                stepsToExecuteInParallel.add(entry.getKey());
            }
        }
        if (stepsToExecuteInParallel.size() > 0) {
            Log.info("Launching parallel steps [" + stepsToExecuteInParallel + "]");
            for (final Step job : stepsToExecuteInParallel) {
                stepsBeingExecuted.add(executor.submit(new Runnable() {
                    @Override
                    public void run() {
                        try {
                            List<String> args = new ArrayList<String>();
                            for (Param param : job.getParameters()) {
                                String paramName = job.getName() + "." + param.getName();
                                args.add("-D");
                                Object val = bindings.get(paramName);
                                if (val == null) {
                                    val = conf.get(paramName);
                                    if (val == null) {
                                        throw new RuntimeException("Unresolved parameter: " + paramName
                                                + " not present in bindings or Hadoop conf.");
                                    }
                                }
                                args.add(paramName + "=" + val);
                            }
                            for (Input input : job.getInputs()) {
                                String inputName = job.getName() + "." + input.name;
                                args.add("--" + input.name);
                                String bindedTo = bindings.get(inputName);
                                Step jOutput = jobOutputBindings.get(inputName);
                                String outputBindedTo = bindings.get(bindedTo);
                                if (outputBindedTo == null) {
                                    if (jOutput != null) {
                                        // sometimes we need to rewrite the path expression to avoid conflicts
                                        if (jOutput.namedOutputs.size() > 0) {
                                            if (bindedTo.endsWith(".output")) {
                                                // main output of a named output job - rebind to glob expression
                                                bindedTo = bindedTo + "/part*";
                                            } else {
                                                // a named output - rebind to glob expression
                                                int lastPoint = bindedTo.lastIndexOf(".");
                                                String namedOutput = bindedTo.substring(lastPoint + 1, bindedTo.length());
                                                bindedTo = bindedTo.substring(0, lastPoint) + "/" + namedOutput;
                                            }
                                        }
                                    }
                                    args.add(bindedTo);
                                } else {
                                    args.add(outputBindedTo);
                                }
                            }
                            args.add("--output");
                            // Output = outputName if it's not binded
                            String bindedTo = bindings.get(job.getOutputName());
                            if (bindedTo == null) {
                                bindedTo = job.getOutputName();
                            }
                            args.add(bindedTo);
                            if (mode.equals(EXECUTION_MODE.OVERWRITE)) {
                                Path p = new Path(bindedTo);
                                HadoopUtils.deleteIfExists(p.getFileSystem(conf), p);
                            }
                            Log.info("Executing [" + job.getName() + "], args: " + args);
                            if (ToolRunner.run(conf, job, args.toArray(new String[0])) < 0) {
                                throw new RuntimeException("Flow failed at step [" + job.getName() + "]");
                            }
                        } catch (Throwable t) {
                            t.printStackTrace();
                            flowFailed.set(true);
                        }
                    }
                }, job));
                stepDependencies.remove(job);
            }
        }
        // Wait until some job finishes, whichever one
        Set<Future<Step>> stepsThatFinished = new HashSet<Future<Step>>();
        while (stepsThatFinished.size() == 0) {
            Thread.sleep(1000);
            if (flowFailed.get()) {
                throw new RuntimeException("Flow failed!");
            }
            for (Future<Step> stepBeingExecuted : stepsBeingExecuted) {
                if (stepBeingExecuted.isDone()) {
                    Step doneStep = stepBeingExecuted.get();
                    Log.info("Step done: [" + doneStep + "]");
                    completedSteps.add(doneStep);
                    stepsThatFinished.add(stepBeingExecuted);
                }
            }
            stepsBeingExecuted.removeAll(stepsThatFinished);
        }
    }
    // Wait until everything is finished
    // This is not very DRY - can it be improved?
    Set<Future<Step>> stepsThatFinished = new HashSet<Future<Step>>();
    while (stepsBeingExecuted.size() > 0) {
        Thread.sleep(1000);
        if (flowFailed.get()) {
            throw new RuntimeException("Flow failed!");
        }
        for (Future<Step> stepBeingExecuted : stepsBeingExecuted) {
            if (stepBeingExecuted.isDone()) {
                Step doneStep = stepBeingExecuted.get();
                Log.info("Step done: [" + doneStep + "]");
                stepsThatFinished.add(stepBeingExecuted);
            }
        }
        stepsBeingExecuted.removeAll(stepsThatFinished);
    }
    executor.shutdownNow();
}
From source file:com.datasalt.pangool.flow.Utils.java
License:Apache License
public static void delete(Path path, Configuration conf) throws IOException {
    HadoopUtils.deleteIfExists(path.getFileSystem(conf), path);
}
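Note that path.getFileSystem(conf) is not the same as FileSystem.get(conf): the former resolves the filesystem from the path's own URI scheme, while the latter always returns the configured default filesystem (fs.defaultFS). A minimal sketch of the difference; the namenode address and paths are hypothetical:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

public class SchemeResolutionSketch {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Fully qualified paths resolve to their own filesystem regardless of fs.defaultFS.
        // The authority "namenode:8020" is a hypothetical cluster address.
        Path hdfsPath = new Path("hdfs://namenode:8020/data/input");
        Path localPath = new Path("file:///tmp/input");
        System.out.println(hdfsPath.getFileSystem(conf).getUri());  // hdfs://namenode:8020
        System.out.println(localPath.getFileSystem(conf).getUri()); // file:///
        // FileSystem.get(conf) would instead return whatever fs.defaultFS points to.
    }
}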
From source file:com.datasalt.pangool.tuplemr.avro.AvroOutputFormat.java
License:Apache License
@Override
public RecordWriter<AvroWrapper<T>, NullWritable> getRecordWriter(TaskAttemptContext job)
        throws IOException, InterruptedException {
    final DataFileWriter<T> writer = new DataFileWriter<T>(new ReflectDatumWriter<T>());
    configureDataFileWriter(writer, job, codecName, deflateLevel);
    Path path = getDefaultWorkFile(job, EXT);
    writer.create(getSchema(), path.getFileSystem(job.getConfiguration()).create(path));
    return new RecordWriter<AvroWrapper<T>, NullWritable>() {
        @Override
        public void write(AvroWrapper<T> wrapper, NullWritable ignore) throws IOException {
            writer.append(wrapper.datum());
        }

        @Override
        public void close(TaskAttemptContext context) throws IOException {
            writer.close();
        }
    };
}
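The chain path.getFileSystem(job.getConfiguration()).create(path) above is the standard way to open an output stream on whatever filesystem owns the work file. A minimal standalone sketch of the same create-and-write pattern, using a hypothetical path:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class CreateFileSketch {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/sketch-output.txt"); // hypothetical output file
        FileSystem fs = path.getFileSystem(conf);
        // create() returns an FSDataOutputStream on the resolved filesystem
        try (FSDataOutputStream out = fs.create(path)) {
            out.writeUTF("hello");
        }
    }
}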
From source file:com.datasalt.pangool.tuplemr.mapred.lib.input.TupleFileRecordReader.java
License:Apache License
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    org.apache.hadoop.mapreduce.lib.input.FileSplit fileSplit = (org.apache.hadoop.mapreduce.lib.input.FileSplit) split;
    conf = context.getConfiguration();
    Path path = fileSplit.getPath();
    FileSystem fs = path.getFileSystem(conf);
    this.in = new TupleFile.Reader(fs, conf, path);
    this.end = fileSplit.getStart() + fileSplit.getLength();
    if (fileSplit.getStart() > in.getPosition()) {
        in.sync(fileSplit.getStart());
    }
    this.start = in.getPosition();
    more = start < end;
    tuple = new Tuple(in.getSchema());
}
From source file:com.datasalt.pangool.tuplemr.mapred.lib.output.TupleOutputFormat.java
License:Apache License
public RecordWriter<ITuple, NullWritable> getRecordWriter(final TaskAttemptContext context)
        throws IOException, InterruptedException {
    final Configuration conf = context.getConfiguration();
    final CompressionCodec codec = getCodec(context);
    final SequenceFile.CompressionType compressionType = getCompressOutput(context)
            ? SequenceFileOutputFormat.getOutputCompressionType(context)
            : SequenceFile.CompressionType.NONE;
    // get the path of the temporary output file
    final Path file = getDefaultWorkFile(context, "");
    final FileSystem fs = file.getFileSystem(conf);
    return new RecordWriter<ITuple, NullWritable>() {
        TupleFile.Writer out;

        public void write(ITuple key, NullWritable value) throws IOException {
            if (out == null) {
                // the writer is created lazily so the schema can be taken from the first tuple
                if (outputSchema == null) {
                    outputSchema = key.getSchema();
                }
                out = new TupleFile.Writer(fs, conf, file, outputSchema, compressionType, codec, context);
            }
            out.append(key);
        }

        public void close(TaskAttemptContext context) throws IOException {
            if (out != null) { // guard against NPE when no records were written
                out.close();
            }
        }
    };
}
From source file:com.datasalt.pangool.tuplemr.mapred.lib.output.TupleTextOutputFormat.java
License:Apache License
@Override
public RecordWriter<ITuple, NullWritable> getRecordWriter(TaskAttemptContext context)
        throws IOException, InterruptedException {
    Path file = getDefaultWorkFile(context, "");
    BufferedWriter writer = new BufferedWriter(
            new OutputStreamWriter(file.getFileSystem(context.getConfiguration()).create(file)));
    CSVWriter csvWriter = new CSVWriter(writer, separatorCharacter, quoteCharacter, escapeCharacter);
    if (addHeader) {
        String[] header = new String[schema.getFields().size()];
        for (int i = 0; i < schema.getFields().size(); i++) {
            header[i] = schema.getFields().get(i).getName();
        }
        csvWriter.writeNext(header);
    }
    return new TupleTextRecordWriter(schema, csvWriter);
}
From source file:com.datasalt.utils.mapred.joiner.MultiJoiner.java
License:Apache License
private void addChanneledInputInner(Integer channel, Path location, Class<? extends Object> channelClass,
        Class<? extends InputFormat> inputFormat, Class<? extends MultiJoinChanneledMapper> mapper)
        throws IOException {
    FileSystem fS = location.getFileSystem(getJob().getConfiguration());
    if (!location.toString().startsWith("/")) {
        // relative path
        location = new Path(fS.getWorkingDirectory(), location);
    } else {
        // absolute path
        location = new Path(fS.getUri() + location.toString());
    }
    addInOrder(channel + "", MultiJoinChanneledMapper.MULTIJOINER_CHANNELED_CHANNELS,
            getJob().getConfiguration());
    addInOrder(location.toString(), MultiJoinChanneledMapper.MULTIJOINER_CHANNELED_FILES,
            getJob().getConfiguration());
    System.out.println("Adding file " + location + " with mapper " + mapper.getName());
    MultipleInputs.addInputPath(getJob(), location, inputFormat, mapper);
}
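The manual absolute/relative handling above can also be expressed with FileSystem.makeQualified, which resolves a relative path against the filesystem's working directory and prepends the filesystem's scheme and authority. A minimal sketch, assuming a hypothetical relative path:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class QualifySketch {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path relative = new Path("data/input"); // hypothetical relative path
        FileSystem fs = relative.getFileSystem(conf);
        // makeQualified resolves against the working directory and adds scheme/authority,
        // e.g. file:/home/user/data/input on a local filesystem
        Path qualified = fs.makeQualified(relative);
        System.out.println(qualified);
    }
}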
From source file:com.datascience.hadoop.CsvInputFormat.java
License:Apache License
@Override
public RecordReader<LongWritable, ListWritable<Text>> getRecordReader(InputSplit inputSplit, JobConf conf,
        Reporter reporter) throws IOException {
    String charsetName = conf.get(CHARSET);
    Charset charset = charsetName != null ? Charset.forName(charsetName) : StandardCharsets.UTF_8;
    FileSplit split = (FileSplit) inputSplit;
    Path path = split.getPath();
    FileSystem fs = path.getFileSystem(conf);
    InputStream is = fs.open(path);

    // If the input is compressed, load the compression codec.
    CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
    CompressionCodec codec = codecFactory.getCodec(path);
    if (codec != null) {
        Decompressor decompressor = CodecPool.getDecompressor(codec);
        is = codec.createInputStream(is, decompressor);
    }
    return new CsvRecordReader(new InputStreamReader(is, charset), createFormat(conf), split.getLength(),
            conf.getBoolean(STRICT_MODE, true));
}
From source file:com.datascience.hadoop.CsvOutputFormat.java
License:Apache License
@Override
public RecordWriter<LongWritable, ListWritable<Text>> getRecordWriter(FileSystem fileSystem, JobConf conf,
        String name, Progressable progress) throws IOException {
    String charsetName = conf.get(CHARSET);
    Charset charset = charsetName != null ? Charset.forName(charsetName) : StandardCharsets.UTF_8;
    Path path;
    if (FileOutputFormat.getCompressOutput(conf)) {
        Class<? extends CompressionCodec> codecClass = FileOutputFormat.getOutputCompressorClass(conf,
                GzipCodec.class);
        CompressionCodec codec = ReflectionUtils.newInstance(codecClass, conf);
        path = FileOutputFormat.getTaskOutputPath(conf, name + codec.getDefaultExtension());
    } else {
        path = FileOutputFormat.getTaskOutputPath(conf, name);
    }
    // note: the fileSystem argument is ignored; the task output path resolves its own filesystem
    return new CsvRecordWriter(new OutputStreamWriter(path.getFileSystem(conf).create(path, progress), charset),
            createFormat(conf));
}
From source file:com.david.mos.out.FileOutputFormat.java
License:Apache License
public void checkOutputSpecs(JobContext job) throws FileAlreadyExistsException, IOException {
    // Ensure that the output directory is set and not already there
    Path outDir = getOutputPath(job);
    if (outDir == null) {
        throw new InvalidJobConfException("Output directory not set.");
    }
    // get delegation token for outDir's file system
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), new Path[] { outDir }, job.getConfiguration());
    if (outDir.getFileSystem(job.getConfiguration()).exists(outDir)) {
        throw new FileAlreadyExistsException("Output directory " + outDir + " already exists");
    }
}
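checkOutputSpecs fails fast rather than silently clobbering existing output. When overwriting is actually intended, the usual counterpart (seen in the BaseFlow example at the top of this page) is to delete the output directory through its own filesystem before the job runs. A minimal sketch with a hypothetical output directory:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class OverwriteOutputSketch {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path outDir = new Path("/user/example/output"); // hypothetical output directory
        FileSystem fs = outDir.getFileSystem(conf);
        if (fs.exists(outDir)) {
            fs.delete(outDir, true); // recursive delete, so checkOutputSpecs will pass
        }
    }
}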