Example usage for org.apache.hadoop.mapred JobConf setOutputFormat

Introduction

This page collects example usages of the setOutputFormat method of org.apache.hadoop.mapred.JobConf.

Prototype

public void setOutputFormat(Class<? extends OutputFormat> theClass)

Source Link

Document

Set the OutputFormat implementation for the map-reduce job.

Usage

From source file:edu.umn.cs.spatialHadoop.temporal.RepartitionTemporal.java

License:Apache License

/**
 * Repartitions the input files into {@code outputPath} by running a MapReduce job whose
 * options are read from {@code params}.
 *
 * <p>Options read from {@code params}: {@code "sindex"} (one of {@code grid}, {@code rtree},
 * {@code r+tree}, {@code str}, {@code str+}), {@code "overwrite"} (defaults to {@code false})
 * and {@code "shape"}.
 *
 * <p>The index type and the shape are validated before any file is read or deleted, so an
 * invalid request fails without touching an existing output.
 *
 * @param inputPaths the files to repartition; their MBRs and sizes are combined for a grid index
 * @param outputPath where the repartitioned output is written
 * @param params job options; also used as the base configuration of the job
 * @throws IOException if a file system or job operation fails
 * @throws InterruptedException declared by the signature
 * @throws RuntimeException if the index type is missing or unsupported, if no shape is set, or
 *         if the output exists and the overwrite flag is not set
 */
public static void repartitionMapReduce(Path[] inputPaths, Path outputPath, OperationsParams params)
        throws IOException, InterruptedException {
    String sindex = params.get("sindex");
    boolean overwrite = params.getBoolean("overwrite", false);
    Shape stockShape = params.getShape("shape");

    // Fail fast on unusable options. Constant-first equals() makes a missing "sindex"
    // report as an unsupported index instead of raising a bare NullPointerException.
    boolean gridIndex = "grid".equals(sindex);
    boolean packedIndex = "rtree".equals(sindex) || "r+tree".equals(sindex) || "str".equals(sindex)
            || "str+".equals(sindex);
    if (!gridIndex && !packedIndex) {
        throw new RuntimeException("Unsupported spatial index: " + sindex);
    }
    // The shape's class is needed as the map output value class further down; checking it
    // here avoids failing only after the existing output has been deleted.
    if (stockShape == null) {
        throw new RuntimeException("Shape is not set; cannot determine the map output value class");
    }

    FileSystem outFs = outputPath.getFileSystem(params);

    @SuppressWarnings("deprecation")
    final long blockSize = outFs.getDefaultBlockSize();

    // Calculate the dimensions of each partition based on gindex type
    CellInfo[] cellInfos;
    if (gridIndex) {
        // Combine the MBR and the size of all input files
        Rectangle inputMBR = FileMBR.fileMBR(inputPaths[0], params);
        long inputFileSize = FileMBR.sizeOfLastProcessedFile;
        for (int i = 1; i < inputPaths.length; i++) {
            Rectangle currentInputMBR = FileMBR.fileMBR(inputPaths[i], params);
            inputMBR.expand(currentInputMBR);
            inputFileSize = inputFileSize + FileMBR.sizeOfLastProcessedFile;
        }

        int num_partitions = calculateNumberOfPartitions(new Configuration(), inputFileSize, outFs, outputPath,
                blockSize);

        GridInfo gridInfo = new GridInfo(inputMBR.x1, inputMBR.y1, inputMBR.x2, inputMBR.y2);
        gridInfo.calculateCellDimensions(num_partitions);
        cellInfos = gridInfo.getAllCells();
    } else {
        // rtree, r+tree, str and str+: pack in rectangles using an RTree
        cellInfos = packInRectangles(inputPaths, outputPath, params, null);
    }

    JobConf job = new JobConf(params, RepartitionTemporal.class);
    job.setJobName("RepartitionTemporal");

    // Overwrite output file
    if (outFs.exists(outputPath)) {
        if (overwrite) {
            outFs.delete(outputPath, true);
        } else {
            throw new RuntimeException(
                    "Output file '" + outputPath + "' already exists and overwrite flag is not set");
        }
    }

    // Decide which map function to use depending on the type of global
    // index
    if ("rtree".equals(sindex) || "str".equals(sindex)) {
        // Repartition without replication
        job.setMapperClass(RepartitionMapNoReplication.class);
    } else {
        // Repartition with replication (grid, str+, and r+tree)
        job.setMapperClass(RepartitionMap.class);
    }
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(stockShape.getClass());
    CombinedSpatialInputFormat.setInputPaths(job, inputPaths);
    job.setInputFormat(CombinedSpatialInputFormat.class);

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(10 * Math.max(1, clusterStatus.getMaxMapTasks()));

    FileOutputFormat.setOutputPath(job, outputPath);
    if (gridIndex || "str".equals(sindex) || "str+".equals(sindex)) {
        job.setOutputFormat(GridOutputFormat.class);
    } else {
        // Only rtree and r+tree remain after the validation above.
        // For now, the two types of local index are the same
        job.setOutputFormat(RTreeGridOutputFormat.class);
    }

    SpatialSite.setCells(job, cellInfos);
    job.setBoolean(SpatialSite.OVERWRITE, overwrite);

    // Set reduce function; use at most one reducer per cell and at least one overall
    job.setReducerClass(RepartitionReduce.class);
    job.setNumReduceTasks(
            Math.max(1, Math.min(cellInfos.length, (clusterStatus.getMaxReduceTasks() * 9 + 5) / 10)));

    // Set output committer that combines output files together
    job.setOutputCommitter(RepartitionOutputCommitter.class);

    JobClient.runJob(job);

}

From source file:edu.umn.cs.spatialHadoop.temporal.RepartitionTemporal.java

License:Apache License

/**
 * Repartitions the input files into {@code outputPath} by running a MapReduce job with
 * explicitly supplied partition cells and options.
 *
 * <p>The index type is validated before the output path is touched, so an unsupported or
 * missing {@code sindex} can no longer delete an existing output and then fail.
 *
 * @param inputPaths the files to repartition
 * @param outputPath where the repartitioned output is written
 * @param stockShape shape instance whose class is used as the map output value class
 * @param blockSize block size to configure for the job; {@code 0} leaves the defaults untouched
 * @param cellInfos the partition cells handed to the job
 * @param sindex index type: {@code grid}, {@code rtree}, {@code r+tree}, {@code str} or {@code str+}
 * @param overwrite whether an existing output path may be deleted
 * @throws IOException if a file system or job operation fails
 * @throws RuntimeException if {@code sindex} is unsupported, or if the output exists and
 *         {@code overwrite} is false
 */
public static void repartitionMapReduce(Path[] inputPaths, Path outputPath, Shape stockShape, long blockSize,
        CellInfo[] cellInfos, String sindex, boolean overwrite) throws IOException {

    // Validate the index type first: nothing destructive may happen for a bad request.
    // Constant-first equals() also turns a null sindex into this error instead of an NPE.
    boolean gridOutput = "grid".equals(sindex) || "str".equals(sindex) || "str+".equals(sindex);
    boolean rtreeOutput = "rtree".equals(sindex) || "r+tree".equals(sindex);
    if (!gridOutput && !rtreeOutput) {
        throw new RuntimeException("Unsupported spatial index: " + sindex);
    }

    // NOTE(review): the job is created with Repartition.class while the job name and the
    // sibling overload use RepartitionTemporal - confirm this is intended.
    JobConf job = new JobConf(Repartition.class);

    job.setJobName("RepartitionTemporal");
    FileSystem outFs = outputPath.getFileSystem(job);

    // Overwrite output file
    if (outFs.exists(outputPath)) {
        if (overwrite) {
            outFs.delete(outputPath, true);
        } else {
            throw new RuntimeException(
                    "Output file '" + outputPath + "' already exists and overwrite flag is not set");
        }
    }

    // Decide which map function to use depending on the type of global
    // index
    if ("rtree".equals(sindex) || "str".equals(sindex)) {
        // Repartition without replication
        job.setMapperClass(RepartitionMapNoReplication.class);
    } else {
        // Repartition with replication (grid, str+, and r+tree)
        job.setMapperClass(RepartitionMap.class);
    }
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(stockShape.getClass());
    CombinedSpatialInputFormat.setInputPaths(job, inputPaths);
    job.setInputFormat(CombinedSpatialInputFormat.class);

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(10 * Math.max(1, clusterStatus.getMaxMapTasks()));

    FileOutputFormat.setOutputPath(job, outputPath);
    if (gridOutput) {
        job.setOutputFormat(GridOutputFormat.class);
    } else {
        // rtree and r+tree. For now, the two types of local index are the same
        job.setOutputFormat(RTreeGridOutputFormat.class);
    }

    SpatialSite.setCells(job, cellInfos);
    job.setBoolean(SpatialSite.OVERWRITE, overwrite);

    // Set reduce function; use at most one reducer per cell and at least one overall
    job.setReducerClass(RepartitionReduce.class);
    job.setNumReduceTasks(
            Math.max(1, Math.min(cellInfos.length, (clusterStatus.getMaxReduceTasks() * 9 + 5) / 10)));

    // Set output committer that combines output files together
    job.setOutputCommitter(RepartitionOutputCommitter.class);

    if (blockSize != 0) {
        job.setLong("dfs.block.size", blockSize);
        job.setLong("fs.local.block.size", blockSize);
    }

    JobClient.runJob(job);
}

From source file:edu.umn.cs.sthadoop.operations.STJoin.java

License:Open Source License

/**
 * /*from  w  w  w . j a  v a 2  s .co m*/
 * @param inputPath
 * @param outputPath
 * @param params
 * @return
 * @throws IOException
 * @throws Exception
 * @throws InterruptedException
 */
private static long stJoin(Path inputPath, Path outputPath, OperationsParams params)
        throws IOException, Exception, InterruptedException {

    JobConf conf = new JobConf(new Configuration(), STJoin.class);
    FileSystem outfs = outputPath.getFileSystem(conf);
    outfs.delete(outputPath, true);
    conf.setJobName("STJoin");
    // pass params to the join map-reduce 
    conf.set("timedistance", params.get("timedistance"));
    conf.set("spacedistance", params.get("spacedistance"));
    //      conf.setMapOutputKeyClass(LongWritable.class);
    //      conf.setMapOutputValueClass(Text.class);
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(Text.class);
    // Mapper settings
    conf.setMapperClass(STJoinMap.class);
    //      conf.setReducerClass(STJoinReduce.class);
    //      conf.setCombinerClass(STJoinReduce.class);
    conf.setBoolean("mapreduce.input.fileinputformat.input.dir.recursive", true);
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);
    conf.setNumReduceTasks(0);
    JobClient.runJob(conf).waitForCompletion();
    outfs = inputPath.getFileSystem(conf);
    outfs.delete(inputPath);
    return 0;
}

From source file:edu.yale.cs.hadoopdb.benchmark.AggTaskLargeDB.java

License:Apache License

/**
 * Builds the job configuration for the "aggregation_db_large" task.
 *
 * @param args {@code args[0]} is the output path; it is passed to
 *        {@code HDFSUtil.deletePath} before being registered as the job output
 * @return the configured job
 * @throws RuntimeException if no argument is supplied
 */
@Override
protected JobConf configureJob(String... args) throws Exception {

    final JobConf jobConf = new JobConf(this.getClass());
    jobConf.setJobName("aggregation_db_large");

    // Map and reduce implementations
    jobConf.setMapperClass(Map.class);
    jobConf.setReducerClass(Reduce.class);

    // Text output of (Text, DoubleWritable) pairs
    jobConf.setOutputFormat(TextOutputFormat.class);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(DoubleWritable.class);

    if (args.length == 0) {
        throw new RuntimeException("Incorrect arguments provided for " + this.getClass());
    }

    // OUTPUT properties
    final Path target = new Path(args[0]);
    HDFSUtil.deletePath(target);
    FileOutputFormat.setOutputPath(jobConf, target);

    // DB properties: relation, record reader and the query to run
    final String query = "SELECT sourceIP, SUM(adRevenue) AS sumAdRevenue FROM UserVisits GROUP BY sourceIP;";
    jobConf.set(DBConst.DB_RELATION_ID, "UserVisits");
    jobConf.set(DBConst.DB_RECORD_READER, AggUserVisitsRecord.class.getName());
    jobConf.set(DBConst.DB_SQL_QUERY, query);

    return jobConf;
}

From source file:edu.yale.cs.hadoopdb.benchmark.AggTaskLargeHDFS.java

License:Apache License

/**
 * Builds the job configuration for the "aggregation_hdfs_large" task.
 *
 * @param args {@code args[0]} is the input path and {@code args[1]} the output path; the
 *        output path is passed to {@code HDFSUtil.deletePath} before being registered
 * @return the configured job
 * @throws RuntimeException if fewer than two arguments are supplied
 */
@Override
protected JobConf configureJob(String... args) throws IOException {

    final JobConf jobConf = new JobConf(getConf(), this.getClass());
    jobConf.setJobName("aggregation_hdfs_large");

    // Text in, text out, with (Text, DoubleWritable) output pairs
    jobConf.setInputFormat(TextInputFormat.class);
    jobConf.setOutputFormat(TextOutputFormat.class);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(DoubleWritable.class);

    // The reducer class is also installed as the combiner
    jobConf.setMapperClass(AggTaskLargeHDFS.Map.class);
    jobConf.setCombinerClass(AggTaskLargeHDFS.Reduce.class);
    jobConf.setReducerClass(AggTaskLargeHDFS.Reduce.class);

    if (args.length <= 1) {
        throw new RuntimeException("Incorrect arguments provided for " + this.getClass());
    }

    // INPUT properties
    final Path source = new Path(args[0]);
    FileInputFormat.setInputPaths(jobConf, source);

    // OUTPUT properties
    final Path target = new Path(args[1]);
    HDFSUtil.deletePath(target);
    FileOutputFormat.setOutputPath(jobConf, target);

    return jobConf;
}

From source file:edu.yale.cs.hadoopdb.benchmark.AggTaskSmallDB.java

License:Apache License

/**
 * Builds the job configuration for the "aggregation_db_small" task.
 *
 * @param args {@code args[0]} is the output path; it is passed to
 *        {@code HDFSUtil.deletePath} before being registered as the job output
 * @return the configured job
 * @throws RuntimeException if no argument is supplied
 */
@Override
protected JobConf configureJob(String... args) throws Exception {

    final JobConf jobConf = new JobConf(this.getClass());
    jobConf.setJobName("aggregation_db_small");

    // Map and reduce implementations
    jobConf.setMapperClass(Map.class);
    jobConf.setReducerClass(Reduce.class);

    // Text output of (Text, DoubleWritable) pairs
    jobConf.setOutputFormat(TextOutputFormat.class);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(DoubleWritable.class);

    if (args.length == 0) {
        throw new RuntimeException("Incorrect arguments provided for " + this.getClass());
    }

    // OUTPUT properties
    final Path target = new Path(args[0]);
    HDFSUtil.deletePath(target);
    FileOutputFormat.setOutputPath(jobConf, target);

    // DB properties: relation, record reader and the query to run
    final String query = "SELECT SUBSTRING(sourceIP, 1, 7) AS subSourceIP, SUM(adRevenue) AS sumAdRevenue FROM UserVisits GROUP BY subSourceIP;";
    jobConf.set(DBConst.DB_RELATION_ID, "UserVisits");
    jobConf.set(DBConst.DB_RECORD_READER, AggUserVisitsRecord.class.getName());
    jobConf.set(DBConst.DB_SQL_QUERY, query);

    return jobConf;
}

From source file:edu.yale.cs.hadoopdb.benchmark.AggTaskSmallHDFS.java

License:Apache License

/**
 * Builds the job configuration for the "aggregation_hdfs_small" task.
 *
 * @param args {@code args[0]} is the input path and {@code args[1]} the output path; the
 *        output path is passed to {@code HDFSUtil.deletePath} before being registered
 * @return the configured job
 * @throws RuntimeException if fewer than two arguments are supplied
 */
@Override
protected JobConf configureJob(String... args) throws IOException {

    final JobConf jobConf = new JobConf(this.getClass());
    jobConf.setJobName("aggregation_hdfs_small");

    // Text in, text out, with (Text, DoubleWritable) output pairs
    jobConf.setInputFormat(TextInputFormat.class);
    jobConf.setOutputFormat(TextOutputFormat.class);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(DoubleWritable.class);

    // The reducer class is also installed as the combiner
    jobConf.setMapperClass(AggTaskSmallHDFS.Map.class);
    jobConf.setCombinerClass(AggTaskSmallHDFS.Reduce.class);
    jobConf.setReducerClass(AggTaskSmallHDFS.Reduce.class);

    if (args.length <= 1) {
        throw new RuntimeException("Incorrect arguments provided for " + this.getClass());
    }

    // INPUT properties
    final Path source = new Path(args[0]);
    FileInputFormat.setInputPaths(jobConf, source);

    // OUTPUT properties
    final Path target = new Path(args[1]);
    HDFSUtil.deletePath(target);
    FileOutputFormat.setOutputPath(jobConf, target);

    return jobConf;
}

From source file:edu.yale.cs.hadoopdb.benchmark.GrepTaskDB.java

License:Apache License

/**
 * Builds the job configuration for the map-only "grep_db_job" task.
 *
 * <p>Recognized arguments are {@code -pattern <text>} and {@code -output <path>}; other
 * arguments are ignored. Both values are required: a flag without a value, or a missing
 * pattern or output, is rejected before the output path is deleted or the query is built.
 *
 * @param args command-line style flag/value pairs
 * @return the configured job
 * @throws RuntimeException if a flag has no value or a required value is missing
 */
@Override
protected JobConf configureJob(String... args) throws IOException {

    JobConf conf = new JobConf(GrepTaskDB.class);
    conf.setJobName("grep_db_job");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapperClass(Map.class);
    conf.setNumReduceTasks(0);
    conf.setOutputFormat(TextOutputFormat.class);

    // GREP arguments: each recognized flag is stored under its name without the dash
    for (int i = 0; i < args.length; ++i) {
        String flag = args[i];
        if ("-pattern".equals(flag) || "-output".equals(flag)) {
            if (i + 1 >= args.length) {
                // Flag given as the last argument, without a value
                throw new RuntimeException("Incorrect arguments provided for " + this.getClass());
            }
            conf.set(flag.substring(1), args[++i]);
        }
    }

    // Without this check a missing output fails inside Path, and a missing pattern
    // silently searches for the literal text "null".
    String output = conf.get("output");
    String pattern = conf.get("pattern");
    if (output == null || pattern == null) {
        throw new RuntimeException("Incorrect arguments provided for " + this.getClass());
    }

    // OUTPUT properties
    Path outputPath = new Path(output);
    System.out.println(output);
    HDFSUtil.deletePath(outputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);

    // DB properties
    conf.set(DBConst.DB_RELATION_ID, "grep");
    conf.set(DBConst.DB_RECORD_READER, DocumentsRecord.class.getName());
    // NOTE(review): the pattern is concatenated into the SQL text unescaped; a pattern
    // containing a single quote breaks or alters the query. Escape or restrict it if the
    // pattern can come from an untrusted source.
    conf.set(DBConst.DB_SQL_QUERY,
            "SELECT key1, field FROM grep WHERE field LIKE '%" + pattern + "%';");

    return conf;

}

From source file:edu.yale.cs.hadoopdb.benchmark.GrepTaskHDFS.java

License:Apache License

/**
 * Builds the job configuration for the map-only "grep_hdfs" task.
 *
 * @param args {@code args[0]} is the grep pattern, {@code args[1]} the input path and
 *        {@code args[2]} the output path; the output path is passed to
 *        {@code HDFSUtil.deletePath} before being registered as the job output
 * @return the configured job
 * @throws RuntimeException if fewer than three arguments are supplied
 */
@Override
protected JobConf configureJob(String... args) throws IOException {

    final JobConf jobConf = new JobConf(getConf(), this.getClass());
    jobConf.setJobName("grep_hdfs");

    // Map-only job: no reduce tasks
    jobConf.setMapperClass(Map.class);
    jobConf.setNumReduceTasks(0);

    // Text in, text out, with (Text, Text) output pairs
    jobConf.setInputFormat(TextInputFormat.class);
    jobConf.setOutputFormat(TextOutputFormat.class);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);

    if (args.length <= 2) {
        throw new RuntimeException("Incorrect arguments provided for " + this.getClass());
    }

    // Pattern to search for
    jobConf.set(GREP_PATTERN_PARAM, args[0]);

    // INPUT properties
    final Path source = new Path(args[1]);
    FileInputFormat.setInputPaths(jobConf, source);

    // OUTPUT properties
    final Path target = new Path(args[2]);
    HDFSUtil.deletePath(target);
    FileOutputFormat.setOutputPath(jobConf, target);

    return jobConf;
}

From source file:edu.yale.cs.hadoopdb.benchmark.JoinTaskDB.java

License:Apache License

/**
 * Builds the job configuration for the "join_db" task.
 *
 * <p>Recognized arguments are {@code -date_l <date>}, {@code -date_u <date>} and
 * {@code -output <path>}; other arguments are ignored. All three values are required: a flag
 * without a value, or a missing value, is rejected before the output path is deleted or the
 * query is built.
 *
 * @param args command-line style flag/value pairs
 * @return the configured job
 * @throws RuntimeException if a flag has no value or a required value is missing
 */
@Override
protected JobConf configureJob(String... args) throws Exception {
    JobConf conf = new JobConf(JoinTaskDB.class);
    conf.setJobName("join_db");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapperClass(Map.class);
    conf.setReducerClass(Reduce.class);
    conf.setNumReduceTasks(1); // Because we look for 1 TOP value
    conf.setOutputFormat(TextOutputFormat.class);

    // join arguments: each recognized flag is stored under its name without the dash
    for (int i = 0; i < args.length; ++i) {
        String flag = args[i];
        if ("-date_l".equals(flag) || "-date_u".equals(flag) || "-output".equals(flag)) {
            if (i + 1 >= args.length) {
                // Flag given as the last argument, without a value
                throw new RuntimeException("Incorrect arguments provided for " + this.getClass());
            }
            conf.set(flag.substring(1), args[++i]);
        }
    }

    // Without this check a missing output fails inside Path, and missing dates silently
    // produce a query over the literal text 'null'.
    String output = conf.get("output");
    String lowerDate = conf.get("date_l");
    String upperDate = conf.get("date_u");
    if (output == null || lowerDate == null || upperDate == null) {
        throw new RuntimeException("Incorrect arguments provided for " + this.getClass());
    }

    // OUTPUT properties
    Path outputPath = new Path(output);
    HDFSUtil.deletePath(outputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);

    conf.set(DBConst.DB_RELATION_ID, "UserVisits");
    conf.set(DBConst.DB_RECORD_READER, JoinRecord.class.getName());

    String rankingsTable = "Rankings";
    String userVisitsTable = "UserVisits";

    // NOTE(review): the dates are concatenated into the SQL text unescaped; validate or
    // escape them if they can come from an untrusted source.
    conf.set(DBConst.DB_SQL_QUERY,
            "SELECT sourceIP, SUM(pageRank) as sumPageRank, COUNT(pageRank) as countPageRank, SUM(adRevenue) as totalRevenue "
                    + "FROM " + rankingsTable + " AS R, " + userVisitsTable + " AS UV "
                    + "WHERE R.pageURL = UV.destURL "
                    + "AND UV.visitDate BETWEEN '" + lowerDate + "' AND '" + upperDate + "' "
                    + "GROUP BY UV.sourceIP;");

    return conf;
}