Example usage for org.apache.hadoop.fs Path Path

Introduction

In this page you can find the example usage for org.apache.hadoop.fs Path Path.

Prototype

public Path(URI aUri)

Source Link

Document

Construct a path from a URI

Usage

From source file:ai.grakn.graph.internal.computer.GraknSparkComputer.java

License:Open Source License

public GraknSparkComputer(final HadoopGraph hadoopGraph) {
    super(hadoopGraph);
    this.sparkConfiguration = new HadoopConfiguration();
    ConfigurationUtils.copy(this.hadoopGraph.configuration(), this.sparkConfiguration);

    this.apacheConfiguration = new HadoopConfiguration(this.sparkConfiguration);
    apacheConfiguration.setProperty(Constants.GREMLIN_HADOOP_GRAPH_OUTPUT_FORMAT_HAS_EDGES, false);
    hadoopConfiguration = ConfUtil.makeHadoopConfiguration(apacheConfiguration);

    if (hadoopConfiguration.get(Constants.GREMLIN_SPARK_GRAPH_INPUT_RDD, null) == null
            && hadoopConfiguration.get(Constants.GREMLIN_HADOOP_GRAPH_INPUT_FORMAT, null) != null
            && FileInputFormat.class.isAssignableFrom(hadoopConfiguration
                    .getClass(Constants.GREMLIN_HADOOP_GRAPH_INPUT_FORMAT, InputFormat.class))) {
        try {/*  ww w.ja  va 2s.c  o  m*/
            final String inputLocation = FileSystem.get(hadoopConfiguration)
                    .getFileStatus(new Path(hadoopConfiguration.get(Constants.GREMLIN_HADOOP_INPUT_LOCATION)))
                    .getPath().toString();
            apacheConfiguration.setProperty(Constants.MAPREDUCE_INPUT_FILEINPUTFORMAT_INPUTDIR, inputLocation);
            hadoopConfiguration.set(Constants.MAPREDUCE_INPUT_FILEINPUTFORMAT_INPUTDIR, inputLocation);
        } catch (final IOException e) {
            throw new IllegalStateException(e.getMessage(), e);
        }
    }
}

From source file:ai.grakn.kb.internal.computer.GraknSparkComputer.java

License:Open Source License

@SuppressWarnings("PMD.UnusedFormalParameter")
private Future<ComputerResult> submitWithExecutor() {
    jobGroupId = Integer.toString(ThreadLocalRandom.current().nextInt(Integer.MAX_VALUE));
    String jobDescription = this.vertexProgram == null ? this.mapReducers.toString()
            : this.vertexProgram + "+" + this.mapReducers;

    // Use different output locations
    this.sparkConfiguration.setProperty(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION,
            this.sparkConfiguration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION) + "/" + jobGroupId);

    updateConfigKeys(sparkConfiguration);

    final Future<ComputerResult> result = computerService.submit(() -> {
        final long startTime = System.currentTimeMillis();

        // apache and hadoop configurations that are used throughout the graph computer computation
        final org.apache.commons.configuration.Configuration graphComputerConfiguration = new HadoopConfiguration(
                this.sparkConfiguration);
        if (!graphComputerConfiguration.containsKey(Constants.SPARK_SERIALIZER)) {
            graphComputerConfiguration.setProperty(Constants.SPARK_SERIALIZER,
                    GryoSerializer.class.getCanonicalName());
        }//ww w  .  j  a v a 2 s  .  co m
        graphComputerConfiguration.setProperty(Constants.GREMLIN_HADOOP_GRAPH_WRITER_HAS_EDGES,
                this.persist.equals(GraphComputer.Persist.EDGES));

        final Configuration hadoopConfiguration = ConfUtil.makeHadoopConfiguration(graphComputerConfiguration);

        final Storage fileSystemStorage = FileSystemStorage.open(hadoopConfiguration);
        final boolean inputFromHDFS = FileInputFormat.class.isAssignableFrom(
                hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_READER, Object.class));
        final boolean inputFromSpark = PersistedInputRDD.class.isAssignableFrom(
                hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_READER, Object.class));
        final boolean outputToHDFS = FileOutputFormat.class.isAssignableFrom(
                hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_WRITER, Object.class));
        final boolean outputToSpark = PersistedOutputRDD.class.isAssignableFrom(
                hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_WRITER, Object.class));
        final boolean skipPartitioner = graphComputerConfiguration
                .getBoolean(Constants.GREMLIN_SPARK_SKIP_PARTITIONER, false);
        final boolean skipPersist = graphComputerConfiguration
                .getBoolean(Constants.GREMLIN_SPARK_SKIP_GRAPH_CACHE, false);

        if (inputFromHDFS) {
            String inputLocation = Constants
                    .getSearchGraphLocation(hadoopConfiguration.get(Constants.GREMLIN_HADOOP_INPUT_LOCATION),
                            fileSystemStorage)
                    .orElse(null);
            if (null != inputLocation) {
                try {
                    graphComputerConfiguration.setProperty(Constants.MAPREDUCE_INPUT_FILEINPUTFORMAT_INPUTDIR,
                            FileSystem.get(hadoopConfiguration).getFileStatus(new Path(inputLocation)).getPath()
                                    .toString());
                    hadoopConfiguration.set(Constants.MAPREDUCE_INPUT_FILEINPUTFORMAT_INPUTDIR,
                            FileSystem.get(hadoopConfiguration).getFileStatus(new Path(inputLocation)).getPath()
                                    .toString());
                } catch (final IOException e) {
                    throw new IllegalStateException(e.getMessage(), e);
                }
            }
        }

        final InputRDD inputRDD;
        final OutputRDD outputRDD;
        final boolean filtered;
        try {
            inputRDD = InputRDD.class.isAssignableFrom(
                    hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_READER, Object.class))
                            ? hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_READER,
                                    InputRDD.class, InputRDD.class).newInstance()
                            : InputFormatRDD.class.newInstance();
            outputRDD = OutputRDD.class.isAssignableFrom(
                    hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_WRITER, Object.class))
                            ? hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_WRITER,
                                    OutputRDD.class, OutputRDD.class).newInstance()
                            : OutputFormatRDD.class.newInstance();

            // if the input class can filter on load, then set the filters
            if (inputRDD instanceof InputFormatRDD
                    && GraphFilterAware.class.isAssignableFrom(hadoopConfiguration.getClass(
                            Constants.GREMLIN_HADOOP_GRAPH_READER, InputFormat.class, InputFormat.class))) {
                GraphFilterAware.storeGraphFilter(graphComputerConfiguration, hadoopConfiguration,
                        this.graphFilter);
                filtered = false;
            } else if (inputRDD instanceof GraphFilterAware) {
                ((GraphFilterAware) inputRDD).setGraphFilter(this.graphFilter);
                filtered = false;
            } else
                filtered = this.graphFilter.hasFilter();
        } catch (final InstantiationException | IllegalAccessException e) {
            throw new IllegalStateException(e.getMessage(), e);
        }

        // create the spark context from the graph computer configuration
        final JavaSparkContext sparkContext = new JavaSparkContext(Spark.create(hadoopConfiguration));
        final Storage sparkContextStorage = SparkContextStorage.open();

        sparkContext.setJobGroup(jobGroupId, jobDescription);

        GraknSparkMemory memory = null;
        // delete output location
        final String outputLocation = hadoopConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION, null);
        if (null != outputLocation) {
            if (outputToHDFS && fileSystemStorage.exists(outputLocation)) {
                fileSystemStorage.rm(outputLocation);
            }
            if (outputToSpark && sparkContextStorage.exists(outputLocation)) {
                sparkContextStorage.rm(outputLocation);
            }
        }

        // the Spark application name will always be set by SparkContextStorage,
        // thus, INFO the name to make it easier to debug
        logger.debug(Constants.GREMLIN_HADOOP_SPARK_JOB_PREFIX
                + (null == this.vertexProgram ? "No VertexProgram" : this.vertexProgram) + "["
                + this.mapReducers + "]");

        // add the project jars to the cluster
        this.loadJars(hadoopConfiguration, sparkContext);
        updateLocalConfiguration(sparkContext, hadoopConfiguration);

        // create a message-passing friendly rdd from the input rdd
        boolean partitioned = false;
        JavaPairRDD<Object, VertexWritable> loadedGraphRDD = inputRDD.readGraphRDD(graphComputerConfiguration,
                sparkContext);

        // if there are vertex or edge filters, filter the loaded graph rdd prior to partitioning and persisting
        if (filtered) {
            this.logger.debug("Filtering the loaded graphRDD: " + this.graphFilter);
            loadedGraphRDD = GraknSparkExecutor.applyGraphFilter(loadedGraphRDD, this.graphFilter);
        }
        // if the loaded graph RDD is already partitioned use that partitioner,
        // else partition it with HashPartitioner
        if (loadedGraphRDD.partitioner().isPresent()) {
            this.logger.debug("Using the existing partitioner associated with the loaded graphRDD: "
                    + loadedGraphRDD.partitioner().get());
        } else {
            if (!skipPartitioner) {
                final Partitioner partitioner = new HashPartitioner(
                        this.workersSet ? this.workers : loadedGraphRDD.partitions().size());
                this.logger.debug("Partitioning the loaded graphRDD: " + partitioner);
                loadedGraphRDD = loadedGraphRDD.partitionBy(partitioner);
                partitioned = true;
                assert loadedGraphRDD.partitioner().isPresent();
            } else {
                // no easy way to test this with a test case
                assert skipPartitioner == !loadedGraphRDD.partitioner().isPresent();

                this.logger.debug("Partitioning has been skipped for the loaded graphRDD via "
                        + Constants.GREMLIN_SPARK_SKIP_PARTITIONER);
            }
        }
        // if the loaded graphRDD was already partitioned previous,
        // then this coalesce/repartition will not take place
        if (this.workersSet) {
            // ensures that the loaded graphRDD does not have more partitions than workers
            if (loadedGraphRDD.partitions().size() > this.workers) {
                loadedGraphRDD = loadedGraphRDD.coalesce(this.workers);
            } else {
                // ensures that the loaded graphRDD does not have less partitions than workers
                if (loadedGraphRDD.partitions().size() < this.workers) {
                    loadedGraphRDD = loadedGraphRDD.repartition(this.workers);
                }
            }
        }
        // persist the vertex program loaded graph as specified by configuration
        // or else use default cache() which is MEMORY_ONLY
        if (!skipPersist && (!inputFromSpark || partitioned || filtered)) {
            loadedGraphRDD = loadedGraphRDD.persist(StorageLevel.fromString(
                    hadoopConfiguration.get(Constants.GREMLIN_SPARK_GRAPH_STORAGE_LEVEL, "MEMORY_ONLY")));
        }
        // final graph with view
        // (for persisting and/or mapReducing -- may be null and thus, possible to save space/time)
        JavaPairRDD<Object, VertexWritable> computedGraphRDD = null;
        try {
            ////////////////////////////////
            // process the vertex program //
            ////////////////////////////////
            if (null != this.vertexProgram) {
                memory = new GraknSparkMemory(this.vertexProgram, this.mapReducers, sparkContext);
                /////////////////
                // if there is a registered VertexProgramInterceptor, use it to bypass the GraphComputer semantics
                if (graphComputerConfiguration
                        .containsKey(Constants.GREMLIN_HADOOP_VERTEX_PROGRAM_INTERCEPTOR)) {
                    try {
                        final GraknSparkVertexProgramInterceptor<VertexProgram> interceptor = (GraknSparkVertexProgramInterceptor) Class
                                .forName(graphComputerConfiguration
                                        .getString(Constants.GREMLIN_HADOOP_VERTEX_PROGRAM_INTERCEPTOR))
                                .newInstance();
                        computedGraphRDD = interceptor.apply(this.vertexProgram, loadedGraphRDD, memory);
                    } catch (final ClassNotFoundException | IllegalAccessException | InstantiationException e) {
                        throw new IllegalStateException(e.getMessage());
                    }
                } else {
                    // standard GraphComputer semantics
                    // get a configuration that will be propagated to all workers
                    final HadoopConfiguration vertexProgramConfiguration = new HadoopConfiguration();
                    this.vertexProgram.storeState(vertexProgramConfiguration);
                    // set up the vertex program and wire up configurations
                    this.vertexProgram.setup(memory);
                    JavaPairRDD<Object, ViewIncomingPayload<Object>> viewIncomingRDD = null;
                    memory.broadcastMemory(sparkContext);
                    // execute the vertex program
                    while (true) {
                        if (Thread.interrupted()) {
                            sparkContext.cancelAllJobs();
                            throw new TraversalInterruptedException();
                        }
                        memory.setInExecute(true);
                        viewIncomingRDD = GraknSparkExecutor.executeVertexProgramIteration(loadedGraphRDD,
                                viewIncomingRDD, memory, graphComputerConfiguration,
                                vertexProgramConfiguration);
                        memory.setInExecute(false);
                        if (this.vertexProgram.terminate(memory)) {
                            break;
                        } else {
                            memory.incrIteration();
                            memory.broadcastMemory(sparkContext);
                        }
                    }
                    // if the graph will be continued to be used (persisted or mapreduced),
                    // then generate a view+graph
                    if ((null != outputRDD && !this.persist.equals(Persist.NOTHING))
                            || !this.mapReducers.isEmpty()) {
                        computedGraphRDD = GraknSparkExecutor.prepareFinalGraphRDD(loadedGraphRDD,
                                viewIncomingRDD, this.vertexProgram.getVertexComputeKeys());
                        assert null != computedGraphRDD && computedGraphRDD != loadedGraphRDD;
                    } else {
                        // ensure that the computedGraphRDD was not created
                        assert null == computedGraphRDD;
                    }
                }
                /////////////////
                memory.complete(); // drop all transient memory keys
                // write the computed graph to the respective output (rdd or output format)
                if (null != outputRDD && !this.persist.equals(Persist.NOTHING)) {
                    // the logic holds that a computeGraphRDD must be created at this point
                    assert null != computedGraphRDD;

                    outputRDD.writeGraphRDD(graphComputerConfiguration, computedGraphRDD);
                }
            }

            final boolean computedGraphCreated = computedGraphRDD != null && computedGraphRDD != loadedGraphRDD;
            if (!computedGraphCreated) {
                computedGraphRDD = loadedGraphRDD;
            }

            final Memory.Admin finalMemory = null == memory ? new MapMemory() : new MapMemory(memory);

            //////////////////////////////
            // process the map reducers //
            //////////////////////////////
            if (!this.mapReducers.isEmpty()) {
                // create a mapReduceRDD for executing the map reduce jobs on
                JavaPairRDD<Object, VertexWritable> mapReduceRDD = computedGraphRDD;
                if (computedGraphCreated && !outputToSpark) {
                    // drop all the edges of the graph as they are not used in mapReduce processing
                    mapReduceRDD = computedGraphRDD.mapValues(vertexWritable -> {
                        vertexWritable.get().dropEdges(Direction.BOTH);
                        return vertexWritable;
                    });
                    // if there is only one MapReduce to execute, don't bother wasting the clock cycles.
                    if (this.mapReducers.size() > 1) {
                        mapReduceRDD = mapReduceRDD.persist(StorageLevel.fromString(hadoopConfiguration
                                .get(Constants.GREMLIN_SPARK_GRAPH_STORAGE_LEVEL, "MEMORY_ONLY")));
                    }
                }

                for (final MapReduce mapReduce : this.mapReducers) {
                    // execute the map reduce job
                    final HadoopConfiguration newApacheConfiguration = new HadoopConfiguration(
                            graphComputerConfiguration);
                    mapReduce.storeState(newApacheConfiguration);
                    // map
                    final JavaPairRDD mapRDD = GraknSparkExecutor.executeMap(mapReduceRDD, mapReduce,
                            newApacheConfiguration);
                    // combine
                    final JavaPairRDD combineRDD = mapReduce.doStage(MapReduce.Stage.COMBINE)
                            ? GraknSparkExecutor.executeCombine(mapRDD, newApacheConfiguration)
                            : mapRDD;
                    // reduce
                    final JavaPairRDD reduceRDD = mapReduce.doStage(MapReduce.Stage.REDUCE)
                            ? GraknSparkExecutor.executeReduce(combineRDD, mapReduce, newApacheConfiguration)
                            : combineRDD;
                    // write the map reduce output back to disk and computer result memory
                    if (null != outputRDD) {
                        mapReduce.addResultToMemory(finalMemory, outputRDD.writeMemoryRDD(
                                graphComputerConfiguration, mapReduce.getMemoryKey(), reduceRDD));
                    }
                }
                // if the mapReduceRDD is not simply the computed graph, unpersist the mapReduceRDD
                if (computedGraphCreated && !outputToSpark) {
                    assert loadedGraphRDD != computedGraphRDD;
                    assert mapReduceRDD != computedGraphRDD;
                    mapReduceRDD.unpersist();
                } else {
                    assert mapReduceRDD == computedGraphRDD;
                }
            }

            // unpersist the loaded graph if it will not be used again (no PersistedInputRDD)
            // if the graphRDD was loaded from Spark, but then partitioned or filtered, its a different RDD
            if (!inputFromSpark || partitioned || filtered) {
                loadedGraphRDD.unpersist();
            }
            // unpersist the computed graph if it will not be used again (no PersistedOutputRDD)
            // if the computed graph is the loadedGraphRDD because it was not mutated and not-unpersisted,
            // then don't unpersist the computedGraphRDD/loadedGraphRDD
            if ((!outputToSpark || this.persist.equals(GraphComputer.Persist.NOTHING))
                    && computedGraphCreated) {
                computedGraphRDD.unpersist();
            }
            // delete any file system or rdd data if persist nothing
            if (null != outputLocation && this.persist.equals(GraphComputer.Persist.NOTHING)) {
                if (outputToHDFS) {
                    fileSystemStorage.rm(outputLocation);
                }
                if (outputToSpark) {
                    sparkContextStorage.rm(outputLocation);
                }
            }
            // update runtime and return the newly computed graph
            finalMemory.setRuntime(System.currentTimeMillis() - startTime);
            // clear properties that should not be propagated in an OLAP chain
            graphComputerConfiguration.clearProperty(Constants.GREMLIN_HADOOP_GRAPH_FILTER);
            graphComputerConfiguration.clearProperty(Constants.GREMLIN_HADOOP_VERTEX_PROGRAM_INTERCEPTOR);
            graphComputerConfiguration.clearProperty(Constants.GREMLIN_SPARK_SKIP_GRAPH_CACHE);
            graphComputerConfiguration.clearProperty(Constants.GREMLIN_SPARK_SKIP_PARTITIONER);
            return new DefaultComputerResult(InputOutputHelper.getOutputGraph(graphComputerConfiguration,
                    this.resultGraph, this.persist), finalMemory.asImmutable());
        } catch (Exception e) {
            // So it throws the same exception as tinker does
            throw new RuntimeException(e);
        }
    });
    computerService.shutdown();
    return result;
}

From source file:AllLab_Skeleton.Lab1.Lab1_Wordcount.java

/**
 * @param args the command line arguments
 *///from  w w w. java 2s  . c  om
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(Lab1_Wordcount.class);
    job.setMapperClass(WordCount_Mapper.class);
    job.setCombinerClass(WordCount_Reducer.class);
    job.setReducerClass(WordCount_Reducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:AllLab_Skeleton.Lab2.Lab2SecondarySort.java

/**
 * @param args the command line arguments
 *///from  w  w  w  .  jav a  2s  .c om
public static void main(String[] args) {

    try {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "SecondarySort");
        job.setJarByClass(Lab2SecondarySort.class);

        job.setMapperClass(Lab2Mapper.class);
        job.setMapOutputKeyClass(CompositeKeyWritable.class);
        job.setMapOutputValueClass(NullWritable.class);

        job.setPartitionerClass(Lab2Partitioner.class);
        job.setGroupingComparatorClass(Lab2GroupComparator.class);

        job.setReducerClass(Lab2Reducer.class);
        job.setOutputKeyClass(CompositeKeyWritable.class);
        job.setOutputValueClass(NullWritable.class);

        job.setNumReduceTasks(8);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);

    } catch (IOException | InterruptedException | ClassNotFoundException ex) {
        System.out.println("Erorr Message" + ex.getMessage());
    }
}

From source file:AllLab_Skeleton.Lab4.Lab4_Std_dev.java

public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "medianstd");
    job.setJarByClass(Lab4_Std_dev.class);
    job.setMapperClass(Map.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(DoubleWritable.class);

    job.setReducerClass(Reduce.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(MedianSDCustomWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);

}

From source file:AllLab_Skeleton.Lab6.BloomFilterBhavesh.java

public static void main(String[] args) throws Exception {

    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "Bloom Filter");
    job.setJarByClass(BloomFilterBhavesh.class);
    job.setMapperClass(BloomFilterMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setNumReduceTasks(0);/* ww  w.j av a2  s  .  c o m*/
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    boolean success = job.waitForCompletion(true);
    System.out.println(success);

}

From source file:AllLab_Skeleton.Lab6.BloomFilterUsingDistributedCache.java

public static void main(String[] args) throws Exception {

    Configuration conf = new Configuration();

    Job job = Job.getInstance(conf, "Bloom Filter");
    job.setJarByClass(BloomFilterUsingDistributedCache.class);
    job.setMapperClass(BloomFilterMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(NullWritable.class);

    //adding the file in the cache having the Person class records
    //job.addCacheFile(new Path("localhost:9000/bhavesh/LabAssignment/CacheInput/cache.txt").toUri());
    DistributedCache.addCacheFile(new URI(args[2]), job.getConfiguration());
    job.setNumReduceTasks(0);//from w ww.  j  av a2  s  .c  o m

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.waitForCompletion(true);

}

From source file:AllLab_Skeleton.Lab6.ReduceSideJoin.java

public static void main(String[] args) throws Exception {

    Configuration conf = new Configuration();

    Job job = Job.getInstance(conf, "ReduceSideJoin");
    job.setJarByClass(ReduceSideJoin.class);

    // Use MultipleInputs to set which input uses what mapper
    // This will keep parsing of each data set separate from a logical
    // standpoint
    // The first two elements of the args array are the two inputs
    MultipleInputs.addInputPath(job, new Path(args[0]), TextInputFormat.class, UserJoinMapper.class);
    MultipleInputs.addInputPath(job, new Path(args[1]), TextInputFormat.class, CommentJoinMapper.class);
    job.getConfiguration().set("join.type", "leftouter");
    //job.setNumReduceTasks(0);
    job.setReducerClass(UserJoinReducer.class);

    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, new Path(args[2]));

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.waitForCompletion(true);/*from ww w  .j a va 2  s  .c o m*/
}

From source file:alluxio.checker.MapReduceIntegrationChecker.java

License:Apache License

/**
 * Creates the HDFS filesystem to store output files.
 *
 * @param conf Hadoop configuration/* w w w  .ja va  2 s  . c  o  m*/
 */
private void createHdfsFilesystem(Configuration conf) throws Exception {
    // Inits HDFS file system object
    mFileSystem = FileSystem.get(URI.create(conf.get("fs.defaultFS")), conf);
    mOutputFilePath = new Path("./MapReduceOutputFile");
    if (mFileSystem.exists(mOutputFilePath)) {
        mFileSystem.delete(mOutputFilePath, true);
    }
}

From source file:alluxio.client.hadoop.contract.FileSystemContract.java

License:Apache License

@Override
public Path getTestPath() {
    Path path = mFS.makeQualified(new Path(getTestDataDir()));
    return path;
}