Example usage for org.apache.hadoop.fs Path getFileSystem

Introduction

This page collects example usages of org.apache.hadoop.fs.Path.getFileSystem from the source files listed below.

Prototype

public FileSystem getFileSystem(Configuration conf) throws IOException 

Document

Return the FileSystem that owns this Path.
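
Before the real-world examples, here is a minimal sketch of how the method is typically called. The path, host, and class name are illustrative assumptions, not taken from the examples on this page.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileSystemExample {
    public static void main(String[] args) throws IOException {
        // Illustrative path; the scheme (hdfs://, file://, s3a://, ...) determines which FileSystem owns it
        Path path = new Path("hdfs://namenode:8020/user/data/input.txt");
        Configuration conf = new Configuration();

        // getFileSystem resolves the FileSystem instance that owns this Path
        FileSystem fs = path.getFileSystem(conf);
        System.out.println("FileSystem for " + path + ": " + fs.getUri());
    }
}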

Usage

From source file: com.datasalt.pangool.flow.BaseFlow.java

License: Apache License

public void execute(final EXECUTION_MODE mode, final Configuration conf, String... outputs) throws Exception {
    List<Step> toResolve = new ArrayList<Step>();

    for (String output : outputs) {
        Step orig = findInOutputs(output);
        if (orig == null) {
            throw new IllegalArgumentException("Unknown output: " + output + " not found in flow context.");
        }
        toResolve.add(orig);
    }

    Step orig;

    final Map<String, Step> jobOutputBindings = new HashMap<String, Step>();
    final Map<Step, Set<Step>> stepDependencies = new HashMap<Step, Set<Step>>();

    while (toResolve.size() > 0) {
        Iterator<Step> it = toResolve.iterator();
        orig = it.next();
        it.remove();
        Log.info("Resolving dependencies for " + orig.getName());

        Set<Step> deps = new HashSet<Step>();
        for (Input input : orig.getInputs()) {
            String inputName = orig.getName() + "." + input.name;
            String bindedTo = bindings.get(inputName);

            if (bindedTo == null) {
                throw new IllegalArgumentException(
                        "Input " + inputName + " not binded to anything in current flow context.");
            }

            Step job = findInOutputs(bindedTo);
            if (job == null) {
                if (!inputs.contains(bindedTo)) {
                    throw new IllegalArgumentException("Unknown input: " + bindedTo + " binded to " + inputName
                            + " not found in flow context.");
                }
            } else {
                deps.add(job);
                jobOutputBindings.put(inputName, job);
                toResolve.add(job);
            }
        }

        stepDependencies.put(orig, deps);
    }

    Log.info("Steps to execute and dependencies: " + stepDependencies);
    Set<Step> completedSteps = new HashSet<Step>();
    ExecutorService executor = Executors.newCachedThreadPool();
    Set<Future<Step>> stepsBeingExecuted = new HashSet<Future<Step>>();
    final AtomicBoolean flowFailed = new AtomicBoolean(false);

    while (stepDependencies.keySet().size() > 0) {
        // gather all steps at this level
        // steps to be executed at each moment are steps whose dependencies are only dependencies that already have been
        // executed
        Set<Step> stepsToExecuteInParallel = new HashSet<Step>();
        for (Map.Entry<Step, Set<Step>> entry : stepDependencies.entrySet()) {
            boolean canBeExecuted = true;
            for (Step dependencyStep : entry.getValue()) {
                if (!completedSteps.contains(dependencyStep)) {
                    canBeExecuted = false;
                    break;
                }
            }
            if (canBeExecuted) {
                stepsToExecuteInParallel.add(entry.getKey());
            }
        }

        if (stepsToExecuteInParallel.size() > 0) {
            Log.info("Launching parallel steps [" + stepsToExecuteInParallel + "]");

            for (final Step job : stepsToExecuteInParallel) {
                stepsBeingExecuted.add(executor.submit(new Runnable() {
                    @Override
                    public void run() {
                        try {
                            List<String> args = new ArrayList<String>();
                            for (Param param : job.getParameters()) {
                                String paramName = job.getName() + "." + param.getName();
                                args.add("-D");
                                Object val = bindings.get(paramName);
                                if (val == null) {
                                    val = conf.get(paramName);
                                    if (val == null) {
                                        throw new RuntimeException("Unresolved parameter: " + paramName
                                                + " not present in bindings or Hadoop conf.");
                                    }
                                }
                                args.add(paramName + "=" + val);
                            }
                            for (Input input : job.getInputs()) {
                                String inputName = job.getName() + "." + input.name;
                                args.add("--" + input.name);
                                String bindedTo = bindings.get(inputName);
                                Step jOutput = jobOutputBindings.get(inputName);
                                String outputBindedTo = bindings.get(bindedTo);
                                if (outputBindedTo == null) {
                                    if (jOutput != null) {
                                        // sometimes we need to rewrite the path expression to avoid conflicts
                                        if (jOutput.namedOutputs.size() > 0) {
                                            if (bindedTo.endsWith(".output")) { // main output of a named output job
                                                // rebind to glob expression
                                                bindedTo = bindedTo + "/part*";
                                            } else { // a named output
                                                // rebind to glob expression
                                                int lastPoint = bindedTo.lastIndexOf(".");
                                                String namedOutput = bindedTo.substring(lastPoint + 1,
                                                        bindedTo.length());
                                                bindedTo = bindedTo.substring(0, lastPoint) + "/" + namedOutput;
                                            }
                                        }
                                    }
                                    args.add(bindedTo);
                                } else {
                                    args.add(outputBindedTo);
                                }
                            }
                            args.add("--output");
                            // Output = outputName if it's not binded
                            String bindedTo = bindings.get(job.getOutputName());
                            if (bindedTo == null) {
                                bindedTo = job.getOutputName();
                            }
                            args.add(bindedTo);
                            if (mode.equals(EXECUTION_MODE.OVERWRITE)) {
                                Path p = new Path(bindedTo);
                                HadoopUtils.deleteIfExists(p.getFileSystem(conf), p);
                            }
                            Log.info("Executing [" + job.getName() + "], args: " + args);
                            if (ToolRunner.run(conf, job, args.toArray(new String[0])) < 0) {
                                throw new RuntimeException("Flow failed at step [" + job.getName() + "]");
                            }
                        } catch (Throwable t) {
                            t.printStackTrace();
                            flowFailed.set(true);
                        }
                    }
                }, job));
                stepDependencies.remove(job);
            }
        }

        // Wait until some job finishes, whichever one
        Set<Future<Step>> stepsThatFinished = new HashSet<Future<Step>>();

        while (stepsThatFinished.size() == 0) {
            Thread.sleep(1000);

            if (flowFailed.get()) {
                throw new RuntimeException("Flow failed!");
            }

            for (Future<Step> stepBeingExecuted : stepsBeingExecuted) {
                if (stepBeingExecuted.isDone()) {
                    Step doneStep = stepBeingExecuted.get();
                    Log.info("Step done: [" + doneStep + "]");
                    completedSteps.add(doneStep);
                    stepsThatFinished.add(stepBeingExecuted);
                }
            }

            stepsBeingExecuted.removeAll(stepsThatFinished);
        }
    }

    // Wait until everything is finished
    // This is not very DRY - can it be improved?
    Set<Future<Step>> stepsThatFinished = new HashSet<Future<Step>>();

    while (stepsBeingExecuted.size() > 0) {
        Thread.sleep(1000);

        if (flowFailed.get()) {
            throw new RuntimeException("Flow failed!");
        }

        for (Future<Step> stepBeingExecuted : stepsBeingExecuted) {
            if (stepBeingExecuted.isDone()) {
                Step doneStep = stepBeingExecuted.get();
                Log.info("Step done: [" + doneStep + "]");
                stepsThatFinished.add(stepBeingExecuted);
            }
        }

        stepsBeingExecuted.removeAll(stepsThatFinished);
    }

    executor.shutdownNow();
}

From source file: com.datasalt.pangool.flow.Utils.java

License: Apache License

public static void delete(Path path, Configuration conf) throws IOException {
    HadoopUtils.deleteIfExists(path.getFileSystem(conf), path);
}
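
A hypothetical call to this helper could look like the following; the output path is an illustrative assumption.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

import com.datasalt.pangool.flow.Utils;

public class DeleteOutputExample {
    public static void main(String[] args) throws IOException {
        // "/tmp/flow-output" is an illustrative path; Utils.delete resolves its FileSystem and removes it if present
        Utils.delete(new Path("/tmp/flow-output"), new Configuration());
    }
}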

From source file: com.datasalt.pangool.tuplemr.avro.AvroOutputFormat.java

License: Apache License

@Override
public RecordWriter<AvroWrapper<T>, NullWritable> getRecordWriter(TaskAttemptContext job)
        throws IOException, InterruptedException {

    final DataFileWriter<T> writer = new DataFileWriter<T>(new ReflectDatumWriter<T>());

    configureDataFileWriter(writer, job, codecName, deflateLevel);
    Path path = getDefaultWorkFile(job, EXT);
    writer.create(getSchema(), path.getFileSystem(job.getConfiguration()).create(path));

    return new RecordWriter<AvroWrapper<T>, NullWritable>() {
        @Override
        public void write(AvroWrapper<T> wrapper, NullWritable ignore) throws IOException {
            writer.append(wrapper.datum());
        }

        @Override
        public void close(TaskAttemptContext context) throws IOException {
            writer.close();
        }
    };
}

From source file: com.datasalt.pangool.tuplemr.mapred.lib.input.TupleFileRecordReader.java

License: Apache License

@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    org.apache.hadoop.mapreduce.lib.input.FileSplit fileSplit = (org.apache.hadoop.mapreduce.lib.input.FileSplit) split;
    conf = context.getConfiguration();
    Path path = fileSplit.getPath();
    FileSystem fs = path.getFileSystem(conf);
    this.in = new TupleFile.Reader(fs, conf, path);
    this.end = fileSplit.getStart() + fileSplit.getLength();

    if (fileSplit.getStart() > in.getPosition()) {
        in.sync(fileSplit.getStart());
    }

    this.start = in.getPosition();
    more = start < end;

    tuple = new Tuple(in.getSchema());
}

From source file: com.datasalt.pangool.tuplemr.mapred.lib.output.TupleOutputFormat.java

License: Apache License

public RecordWriter<ITuple, NullWritable> getRecordWriter(final TaskAttemptContext context)
        throws IOException, InterruptedException {

    final Configuration conf = context.getConfiguration();

    final CompressionCodec codec = getCodec(context);
    final SequenceFile.CompressionType compressionType = getCompressOutput(context)
            ? SequenceFileOutputFormat.getOutputCompressionType(context)
            : SequenceFile.CompressionType.NONE;
    // get the path of the temporary output file
    final Path file = getDefaultWorkFile(context, "");
    final FileSystem fs = file.getFileSystem(conf);

    return new RecordWriter<ITuple, NullWritable>() {

        TupleFile.Writer out;

        public void write(ITuple key, NullWritable value) throws IOException {
            if (out == null) {
                if (outputSchema == null) {
                    outputSchema = key.getSchema();
                }
                out = new TupleFile.Writer(fs, conf, file, outputSchema, compressionType, codec, context);
            }
            out.append(key);
        }

        public void close(TaskAttemptContext context) throws IOException {
            out.close();
        }
    };
}

From source file: com.datasalt.pangool.tuplemr.mapred.lib.output.TupleTextOutputFormat.java

License: Apache License

@Override
public RecordWriter<ITuple, NullWritable> getRecordWriter(TaskAttemptContext context)
        throws IOException, InterruptedException {

    Path file = getDefaultWorkFile(context, "");
    BufferedWriter writer = new BufferedWriter(
            new OutputStreamWriter(file.getFileSystem(context.getConfiguration()).create(file)));
    CSVWriter csvWriter = new CSVWriter(writer, separatorCharacter, quoteCharacter, escapeCharacter);
    if (addHeader) {
        String[] header = new String[schema.getFields().size()];
        for (int i = 0; i < schema.getFields().size(); i++) {
            header[i] = schema.getFields().get(i).getName();
        }
        csvWriter.writeNext(header);
    }
    return new TupleTextRecordWriter(schema, csvWriter);
}

From source file: com.datasalt.utils.mapred.joiner.MultiJoiner.java

License: Apache License

private void addChanneledInputInner(Integer channel, Path location, Class<? extends Object> channelClass,
        Class<? extends InputFormat> inputFormat, Class<? extends MultiJoinChanneledMapper> mapper)
        throws IOException {

    FileSystem fS = location.getFileSystem(getJob().getConfiguration());
    if (!location.toString().startsWith("/")) {
        // relative path
        location = new Path(fS.getWorkingDirectory(), location);
    } else {
        // absolute path
        location = new Path(fS.getUri() + location.toString());
    }
    addInOrder(channel + "", MultiJoinChanneledMapper.MULTIJOINER_CHANNELED_CHANNELS,
            getJob().getConfiguration());
    addInOrder(location.toString(), MultiJoinChanneledMapper.MULTIJOINER_CHANNELED_FILES,
            getJob().getConfiguration());
    System.out.println("Adding file " + location + " with mapper " + mapper.getName());
    MultipleInputs.addInputPath(getJob(), location, inputFormat, mapper);
}

From source file: com.datascience.hadoop.CsvInputFormat.java

License: Apache License

@Override
public RecordReader<LongWritable, ListWritable<Text>> getRecordReader(InputSplit inputSplit, JobConf conf,
        Reporter reporter) throws IOException {
    String charsetName = conf.get(CHARSET);
    Charset charset = charsetName != null ? Charset.forName(charsetName) : StandardCharsets.UTF_8;

    FileSplit split = (FileSplit) inputSplit;
    Path path = split.getPath();
    FileSystem fs = path.getFileSystem(conf);
    InputStream is = fs.open(path);

    // If the input is compressed, load the compression codec.
    CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
    CompressionCodec codec = codecFactory.getCodec(path);
    if (codec != null) {
        Decompressor decompressor = CodecPool.getDecompressor(codec);
        is = codec.createInputStream(is, decompressor);
    }
    return new CsvRecordReader(new InputStreamReader(is, charset), createFormat(conf), split.getLength(),
            conf.getBoolean(STRICT_MODE, true));
}

From source file: com.datascience.hadoop.CsvOutputFormat.java

License: Apache License

@Override
public RecordWriter<LongWritable, ListWritable<Text>> getRecordWriter(FileSystem fileSystem, JobConf conf,
        String name, Progressable progress) throws IOException {
    String charsetName = conf.get(CHARSET);
    Charset charset = charsetName != null ? Charset.forName(charsetName) : StandardCharsets.UTF_8;

    Path path;
    if (FileOutputFormat.getCompressOutput(conf)) {
        Class<? extends CompressionCodec> codecClass = FileOutputFormat.getOutputCompressorClass(conf,
                GzipCodec.class);
        CompressionCodec codec = ReflectionUtils.newInstance(codecClass, conf);
        path = FileOutputFormat.getTaskOutputPath(conf, name + codec.getDefaultExtension());
    } else {
        path = FileOutputFormat.getTaskOutputPath(conf, name);
    }
    return new CsvRecordWriter(new OutputStreamWriter(path.getFileSystem(conf).create(path, progress), charset),
            createFormat(conf));
}

From source file: com.david.mos.out.FileOutputFormat.java

License: Apache License

public void checkOutputSpecs(JobContext job) throws FileAlreadyExistsException, IOException {
    // Ensure that the output directory is set and not already there
    Path outDir = getOutputPath(job);
    if (outDir == null) {
        throw new InvalidJobConfException("Output directory not set.");
    }

    // get delegation token for outDir's file system
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), new Path[] { outDir }, job.getConfiguration());

    if (outDir.getFileSystem(job.getConfiguration()).exists(outDir)) {
        throw new FileAlreadyExistsException("Output directory " + outDir + " already exists");
    }
}