Usage examples for org.apache.hadoop.mapred.JobConf.setMapOutputKeyClass
public void setMapOutputKeyClass(Class<?> theClass)
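setMapOutputKeyClass declares the class of the keys emitted by the map tasks. It only needs to be called when the intermediate (map output) key class differs from the job's final output key class; if it is not set, the class passed to setOutputKeyClass is used for the map output as well. The sketch below is a minimal, self-contained old-API job showing where the call fits; the driver class name and the input/output paths taken from args are placeholders, not drawn from the examples that follow.

import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

public class MapOutputKeyClassExample {        // hypothetical driver class
    public static void main(String[] args) throws IOException {
        JobConf jobConf = new JobConf(MapOutputKeyClassExample.class);
        jobConf.setJobName("setMapOutputKeyClass example");

        // Input lines are split into <Text, Text> pairs at the first tab.
        jobConf.setInputFormat(KeyValueTextInputFormat.class);
        FileInputFormat.addInputPath(jobConf, new Path(args[0]));

        // Intermediate (map output) types. Declaring them explicitly documents
        // the contract between mapper and reducer; they are mandatory whenever
        // they differ from the final output types below.
        jobConf.setMapperClass(IdentityMapper.class);
        jobConf.setMapOutputKeyClass(Text.class);
        jobConf.setMapOutputValueClass(Text.class);

        // Final (reduce output) types.
        jobConf.setReducerClass(IdentityReducer.class);
        jobConf.setOutputKeyClass(Text.class);
        jobConf.setOutputValueClass(Text.class);
        jobConf.setOutputFormat(TextOutputFormat.class);
        FileOutputFormat.setOutputPath(jobConf, new Path(args[1]));

        JobClient.runJob(jobConf);
    }
}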
From source file:org.cloudata.examples.web.DocFreqJob.java
License:Apache License
public void exec(String[] options) throws Exception {
    if (options.length < 1) {
        System.out.println("Usage: java DocFreqJob <num of repeats> docFreq [#reduce]");
        System.exit(0);
    }
    JobConf jobConf = new JobConf(DocFreqJob.class);
    JobClient jobClient = new JobClient(jobConf);
    int maxReduce = jobClient.getClusterStatus().getMaxReduceTasks() * 2;
    if (options.length > 0) {
        maxReduce = Integer.parseInt(options[0]);
    }

    CloudataConf nconf = new CloudataConf();
    if (!CTable.existsTable(nconf, TermUploadJob.TERM_TABLE)) {
        TableSchema termTableInfo = new TableSchema(TermUploadJob.TERM_TABLE, "Test",
                TermUploadJob.TERM_TABLE_COLUMNS);
        CTable.createTable(nconf, termTableInfo);
    }
    Path tempOutputPath = new Path("DocFreqJob_" + System.currentTimeMillis());

    jobConf.setJobName("DocFreqJob" + "(" + new Date() + ")");

    //<MAP>
    jobConf.setMapperClass(DocFreqMap.class);
    jobConf.setInputFormat(AbstractTabletInputFormat.class);
    jobConf.set(AbstractTabletInputFormat.INPUT_TABLE, WebTableJob.WEB_TABLE);
    jobConf.set(AbstractTabletInputFormat.INPUT_COLUMN_LIST,
            WebTableJob.WEB_TABLE_COLUMNS[1] + "," + WebTableJob.WEB_TABLE_COLUMNS[2]);
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    //jobConf.setMaxMapAttempts(0);
    //</MAP>

    //<REDUCE>
    jobConf.setReducerClass(DocFreqReduce.class);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);
    FileOutputFormat.setOutputPath(jobConf, tempOutputPath);
    jobConf.setNumReduceTasks(maxReduce);
    //</REDUCE>

    //Run Job
    JobClient.runJob(jobConf);

    //delete temp output path
    FileSystem fs = FileSystem.get(jobConf);
    fs.delete(tempOutputPath, true);
}
From source file:org.cloudata.examples.web.TermGlobalJob.java
License:Apache License
public void exec() throws Exception {
    CloudataConf nconf = new CloudataConf();
    if (!CTable.existsTable(nconf, GLOBAL_TABLE)) {
        TableSchema globalTableInfo = new TableSchema(GLOBAL_TABLE, "Test", GLOBAL_TABLE_COLUMNS);
        CTable.createTable(nconf, globalTableInfo);
    }
    Path tempOutputPath = new Path("globalTableInfo" + System.currentTimeMillis());

    JobConf jobConf = new JobConf(WebTableJob.class);
    jobConf.setJobName("TermGlobalJob" + "(" + new Date() + ")");

    //<MAP>
    jobConf.setMapperClass(TermGlobalMap.class);
    jobConf.setInputFormat(DefaultTabletInputFormat.class);
    jobConf.set(DefaultTabletInputFormat.INPUT_TABLE, WebTableJob.WEB_TABLE);
    jobConf.set(DefaultTabletInputFormat.INPUT_COLUMN_LIST, WebTableJob.WEB_TABLE_COLUMNS[2]);
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    //</MAP>

    //<REDUCE>
    jobConf.setReducerClass(TermGlobalReduce.class);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);
    jobConf.setNumReduceTasks(1);
    jobConf.setMaxReduceAttempts(0);
    FileOutputFormat.setOutputPath(jobConf, tempOutputPath);
    //</REDUCE>

    //Run Job
    JobClient.runJob(jobConf);

    //delete temp output path
    FileSystem fs = FileSystem.get(jobConf);
    fs.delete(tempOutputPath, true);
}
From source file:org.cloudata.examples.web.TermUploadJob.java
License:Apache License
public void exec(String[] options) throws Exception {
    if (options.length < 1) {
        System.out.println("Usage: java TermUploadJob <num of repeats> termUpload <inputPath> [#reduce]");
        System.exit(0);
    }
    JobConf jobConf = new JobConf(TermUploadJob.class);
    JobClient jobClient = new JobClient(jobConf);
    int maxReduce = jobClient.getClusterStatus().getMaxReduceTasks() * 2;
    if (options.length > 1) {
        maxReduce = Integer.parseInt(options[1]);
    }

    jobConf.setInt("mapred.task.timeout", 60 * 60 * 1000);

    FileSystem fs = FileSystem.get(jobConf);

    CloudataConf nconf = new CloudataConf();
    if (!CTable.existsTable(nconf, TERM_TABLE)) {
        //Table
        Path path = new Path("blogdata/tmp/weight");
        FileStatus[] paths = fs.listStatus(path);
        if (paths == null || paths.length == 0) {
            LOG.error("No Partition info:" + path);
            return;
        }
        SortedSet<Text> terms = new TreeSet<Text>();
        Text text = new Text();
        for (FileStatus eachPath : paths) {
            CloudataLineReader reader = new CloudataLineReader(fs.open(eachPath.getPath()));
            while (true) {
                int length = reader.readLine(text);
                if (length <= 0) {
                    break;
                }
                terms.add(new Text(text));
            }
        }
        int termsPerTablet = terms.size() / (maxReduce - 1);
        int count = 0;
        List<Row.Key> rowKeys = new ArrayList<Row.Key>();
        for (Text term : terms) {
            count++;
            if (count == termsPerTablet) {
                rowKeys.add(new Row.Key(term.getBytes()));
                count = 0;
            }
        }
        rowKeys.add(Row.Key.MAX_KEY);

        TableSchema termTableInfo = new TableSchema(TERM_TABLE, "Test", TERM_TABLE_COLUMNS);
        CTable.createTable(nconf, termTableInfo, rowKeys.toArray(new Row.Key[] {}));
    }
    CTable termTable = CTable.openTable(nconf, TERM_TABLE);
    TabletInfo[] tabletInfos = termTable.listTabletInfos();

    Path tempOutputPath = new Path("WebTableJob_" + System.currentTimeMillis());

    jobConf.setJobName("TermUploadJob" + "(" + new Date() + ")");
    FileInputFormat.addInputPath(jobConf, new Path(options[0]));

    //<MAP>
    jobConf.setMapperClass(TermUploadMap.class);
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    jobConf.setInputFormat(TextInputFormat.class);
    jobConf.set(AbstractTabletInputFormat.OUTPUT_TABLE, TERM_TABLE);
    jobConf.setPartitionerClass(WebKeyRangePartitioner.class);
    jobConf.setMaxMapAttempts(0);
    //</MAP>

    //<REDUCE>
    jobConf.setReducerClass(TermUploadReduce.class);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);
    jobConf.setNumReduceTasks(tabletInfos.length);
    FileOutputFormat.setOutputPath(jobConf, tempOutputPath);
    jobConf.setNumReduceTasks(maxReduce);
    jobConf.setMaxReduceAttempts(0);
    //</REDUCE>

    //Run Job
    JobClient.runJob(jobConf);

    fs.delete(tempOutputPath);
}
From source file:org.cloudata.examples.web.TermWeightJob.java
License:Apache License
public void exec(String[] options) throws Exception {
    if (options.length < 1) {
        System.out.println("Usage: java TermWeightJob <num of repeats> termWeight <outputPath> [noGlobal]");
        System.exit(0);
    }
    if (options.length == 1 || !options[1].equals("noGlobal")) {
        TermGlobalJob termGlobalJob = new TermGlobalJob();
        termGlobalJob.exec();
    }
    Path outputPath = new Path(options[0]);

    JobConf jobConf = new JobConf(WebTableJob.class);
    jobConf.setJobName("TermWeightJob" + "(" + new Date() + ")");

    //<MAP>
    jobConf.setMapperClass(TermWeightMap.class);
    jobConf.setInputFormat(DefaultTabletInputFormat.class);
    jobConf.set(DefaultTabletInputFormat.INPUT_TABLE, WebTableJob.WEB_TABLE);
    jobConf.set(DefaultTabletInputFormat.INPUT_COLUMN_LIST,
            WebTableJob.WEB_TABLE_COLUMNS[1] + "," + WebTableJob.WEB_TABLE_COLUMNS[2]);
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    //jobConf.setMaxMapAttempts(0);
    //</MAP>

    //<REDUCE>
    jobConf.setReducerClass(TermWeightReduce.class);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);
    FileOutputFormat.setOutputPath(jobConf, outputPath);
    //jobConf.setMaxReduceAttempts(0);
    JobClient jobClient = new JobClient(jobConf);
    int maxReduce = jobClient.getClusterStatus().getMaxReduceTasks();
    jobConf.setNumReduceTasks(maxReduce);
    //</REDUCE>

    //Run Job
    JobClient.runJob(jobConf);

    //delete temp output path
    //FileSystem fs = FileSystem.get(jobConf);
    //fs.delete(outputPath);
}
From source file:org.cloudata.examples.web.TermWeightJobOnline.java
License:Apache License
public void exec(String[] options) throws Exception {
    if (options.length < 1) {
        System.out.println("Usage: java TermWeightJobOnline <num of repeats> termWeightOnline [noGlobal]");
        System.exit(0);
    }
    if (options.length == 1 || !options[1].equals("noGlobal")) {
        TermGlobalJob termGlobalJob = new TermGlobalJob();
        termGlobalJob.exec();
    }
    Path outputPath = new Path("TermWeightJobOnline_" + System.currentTimeMillis());

    JobConf jobConf = new JobConf(TermWeightJobOnline.class);
    jobConf.setJobName("TermWeightJobOnline" + "(" + new Date() + ")");

    //<MAP>
    jobConf.setMapperClass(TermWeightMap.class);
    jobConf.setInputFormat(DefaultTabletInputFormat.class);
    jobConf.set(DefaultTabletInputFormat.INPUT_TABLE, WebTableJob.WEB_TABLE);
    jobConf.set(DefaultTabletInputFormat.INPUT_COLUMN_LIST,
            WebTableJob.WEB_TABLE_COLUMNS[1] + "," + WebTableJob.WEB_TABLE_COLUMNS[2]);
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    //jobConf.setMaxMapAttempts(0);
    //</MAP>

    //<REDUCE>
    jobConf.setReducerClass(TermWeightReduceOnline.class);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);
    FileOutputFormat.setOutputPath(jobConf, outputPath);
    //jobConf.setMaxReduceAttempts(0);
    JobClient jobClient = new JobClient(jobConf);
    int maxReduce = jobClient.getClusterStatus().getMaxReduceTasks();
    jobConf.setNumReduceTasks(maxReduce);
    //</REDUCE>

    //Run Job
    try {
        JobClient.runJob(jobConf);
    } finally {
        //delete temp output path
        FileSystem fs = FileSystem.get(jobConf);
        fs.delete(outputPath, true);
    }
}
From source file:org.cloudata.examples.weblink.InLinkJob.java
License:Apache License
public static void main(String[] args) throws IOException {
    if (args.length < 2) {
        System.out.println("Usage: java InLinkJob <input table> <output table>");
        System.exit(0);
    }
    String inputTableName = args[0];
    String outputTableName = args[1];

    CloudataConf nconf = new CloudataConf();
    if (!CTable.existsTable(nconf, outputTableName)) {
        TableSchema tableSchema = new TableSchema(outputTableName);
        tableSchema.addColumn("inlink");
        CTable.createTable(nconf, tableSchema);
    }
    Path tempPath = new Path("InLinkJob_tmp" + System.currentTimeMillis());

    JobConf jobConf = new JobConf(InLinkJob.class);
    String libDir = CloudataMapReduceUtil.initMapReduce(jobConf);

    //<Map>
    jobConf.setMapperClass(InLinkMap.class);
    jobConf.setInputFormat(DefaultTabletInputFormat.class);
    jobConf.set(DefaultTabletInputFormat.INPUT_TABLE, inputTableName);
    jobConf.set(DefaultTabletInputFormat.INPUT_COLUMN_LIST, "outlink");
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(TextArray.class);
    //</Map>

    //<Reduce>
    FileOutputFormat.setOutputPath(jobConf, tempPath);
    jobConf.setReducerClass(InLinkReduce.class);
    jobConf.set(DefaultTabletInputFormat.OUTPUT_TABLE, outputTableName);
    //</Reduce>

    try {
        JobClient.runJob(jobConf);
    } finally {
        CloudataFileSystem fs = CloudataFileSystem.get(nconf);
        fs.delete(new GPath(tempPath.toString()), true);
        CloudataMapReduceUtil.clearMapReduce(libDir);
    }
}
From source file:org.cloudata.examples.weblink.ScanJob.java
License:Apache License
public void run(String[] args) throws IOException {
    if (args.length < 1) {
        System.out.println("Usage: java ScanJob <table name>");
        System.exit(0);
    }
    String tableName = args[0];

    CloudataConf nconf = new CloudataConf();
    if (!CTable.existsTable(nconf, tableName)) {
        System.out.println("No table: " + tableName);
        System.exit(0);
    }

    JobConf jobConf = new JobConf(UploadJob.class);
    jobConf.setJobName("CloudataExamles.weblink.ScanJob_" + new Date());
    String libDir = CloudataMapReduceUtil.initMapReduce(jobConf);

    // <MAP>
    jobConf.setInputFormat(DefaultTabletInputFormat.class);
    jobConf.setMapperClass(ScanJobMapper.class);
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    jobConf.set(AbstractTabletInputFormat.INPUT_TABLE, tableName);
    jobConf.set(AbstractTabletInputFormat.INPUT_COLUMN_LIST, "url");
    // </MAP>

    // <REDUCE>
    FileOutputFormat.setOutputPath(jobConf,
            new Path("CloudataExamles_WebScanJob_" + System.currentTimeMillis()));
    jobConf.setReducerClass(ScanJobReducer.class);
    jobConf.setNumReduceTasks(1);
    // </REDUCE>

    try {
        JobClient.runJob(jobConf);
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        CloudataMapReduceUtil.clearMapReduce(libDir);
    }
}
From source file:org.cloudata.util.matrix.AbstractMatrix.java
License:Apache License
public void mutiply(AbstractMatrix targetMatrix, AbstractMatrix resultMatrix) throws IOException {
    Path tempOutputPath = new Path("temp/Matrix_" + System.currentTimeMillis());

    JobConf jobConf = new JobConf(AbstractMatrix.class);
    jobConf.setJobName("Matrix_Mutiply_Job" + "(" + new Date() + ")");

    //<MAP>
    jobConf.setMapperClass(MatrixMutiplyMap.class);
    jobConf.setInputFormat(MatrixInputFormat.class);
    jobConf.set(MatrixInputFormat.MATRIX_INPUT_TABLE, ctable.getTableName());
    jobConf.set(MatrixInputFormat.MATRIX_INPUT_COLUMN, columnName);
    jobConf.set(MatrixInputFormat.MATRIX_TARGET_TABLE, targetMatrix.ctable.getTableName());
    jobConf.set(MatrixInputFormat.MATRIX_TARGET_COLUMN, targetMatrix.columnName);
    jobConf.setBoolean(MatrixInputFormat.MATRIX_TARGET_SPARSE, targetMatrix.isSparse());
    jobConf.setMapOutputKeyClass(MatrixItem.class);
    jobConf.setMapOutputValueClass(Text.class);
    //</MAP>

    //<REDUCE>
    jobConf.setPartitionerClass(KeyRangePartitioner.class);
    jobConf.set(AbstractTabletInputFormat.OUTPUT_TABLE, resultMatrix.ctable.getTableName());
    jobConf.setReducerClass(MatrixMutiplyReduce.class);
    jobConf.set(MatrixInputFormat.MATRIX_RESULT_TABLE, resultMatrix.ctable.getTableName());
    jobConf.set(MatrixInputFormat.MATRIX_RESULT_COLUMN, resultMatrix.columnName);
    jobConf.setBoolean(MatrixInputFormat.MATRIX_RESULT_SPARSE, resultMatrix.isSparse());
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);

    TabletInfo[] tabletInfos = resultMatrix.ctable.listTabletInfos();
    jobConf.setNumReduceTasks(tabletInfos.length);
    jobConf.setMaxReduceAttempts(0);
    FileOutputFormat.setOutputPath(jobConf, tempOutputPath);
    //</REDUCE>

    //Run Job
    JobClient.runJob(jobConf);

    //delete temp output path
    FileSystem fs = FileSystem.get(jobConf);
    fs.delete(tempOutputPath, true);
}
From source file:org.cloudata.util.upload.UploadUtil.java
License:Apache License
private void doHadoopUpload(CloudataConf conf) throws IOException {
    if (!CTable.existsTable(conf, tableName)) {
        throw new IOException("No table:" + tableName);
    }

    JobConf jobConf = new JobConf(UploadUtil.class);
    String libDir = CloudataMapReduceUtil.initMapReduce(jobConf);

    jobConf.setJobName("UploadJob_" + tableName + "(" + new Date() + ")");

    //KeyRangePartitioner
    //AbstractTabletInputFormat.OUTPUT_TABLE
    jobConf.set(AbstractTabletInputFormat.OUTPUT_TABLE, tableName);

    //<Map>
    FileInputFormat.addInputPath(jobConf, new Path(inputPath));
    jobConf.setInputFormat(TextInputFormat.class);
    jobConf.set("uploadJob.delim", delim);

    String columnStr = "";
    for (String eachColumn : columns) {
        columnStr += eachColumn + ",";
    }
    jobConf.set("uploadJob.columns", columnStr);

    String fieldNumStr = "";
    for (int eachField : fieldNums) {
        fieldNumStr += eachField + ",";
    }
    jobConf.set("uploadJob.fieldNums", fieldNumStr);

    jobConf.setBoolean("uploadJob.keyValuePair", keyValuePair);
    jobConf.setMapperClass(UploadMap.class);
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    jobConf.setMapSpeculativeExecution(false);
    jobConf.setMaxMapAttempts(0);
    //</Map>

    //<Reduce>
    Path tempOutputPath = new Path("temp/uploadJob/" + tableName + "/reducer");
    FileOutputFormat.setOutputPath(jobConf, tempOutputPath);
    jobConf.setNumReduceTasks(0);
    //</Reduce>

    try {
        JobClient.runJob(jobConf);
    } finally {
        FileSystem fs = FileSystem.get(jobConf);
        FileUtil.delete(fs, tempOutputPath, true);
        CloudataMapReduceUtil.clearMapReduce(libDir);
    }
}
From source file:org.commoncrawl.hadoop.io.S3GetMetdataJob.java
License:Open Source License
public static void main(String[] args) {
    String accessKey = args[0];
    String secretKey = args[1];

    String paths[] = {
            // "2008/06",
            // "2008/07",
            // "2008/08",
            // "2008/09",
            // "2008/10",
            // "2008/11",
            "2009" };

    for (int pathIndex = 0; pathIndex < paths.length; ++pathIndex) {

        LOG.info("Processing Path:" + paths[pathIndex]);

        JobConf job = new JobConf(S3GetMetdataJob.class);

        Path tempDir = new Path(
                job.get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

        LOG.info("Output for Path:" + paths[pathIndex] + " is:" + tempDir);
        System.out.println("Output Path is:" + tempDir);

        job.setJobName("S3 To CrawlURLMetadata Job for Path:" + paths[pathIndex]);

        // setup s3 properties
        JetS3tARCSource.setMaxRetries(job, 1);
        // set up S3 credentials ...
        JetS3tARCSource.setAWSAccessKeyID(job, accessKey);
        JetS3tARCSource.setAWSSecretAccessKey(job, secretKey);
        ARCSplitCalculator.setFilesPerSplit(job, 25);
        // set up arc reader properties
        ArcFileReader.setIOTimeoutValue(30000);
        // set input prefixes ...
        JetS3tARCSource.setInputPrefixes(job, paths[pathIndex]);
        // and S3 bucket name ...
        JetS3tARCSource.setBucketName(job, "commoncrawl");
        // and setup arc source for ArcInputFormat
        ARCInputFormat.setARCSourceClass(job, JetS3tARCSource.class);
        // and set up input format ...
        job.setInputFormat(ARCInputFormat.class);
        // set mapper ...
        job.setMapRunnerClass(S3GetMetdataJob.class);
        // setup reducer (identity in this case ... )
        job.setReducerClass(IdentityReducer.class);
        // standard output format ...
        job.setOutputFormat(SequenceFileOutputFormat.class);
        // set output path
        job.setOutputPath(tempDir);
        // map output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(CrawlURLMetadata.class);
        // reduce output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(CrawlURLMetadata.class);
        // double the number of reducers ...
        // job.setNumReduceTasks(job.getNumReduceTasks() * 2);

        // run the job ...
        try {
            LOG.info("Starting Job:" + job.getJobName());
            JobClient.runJob(job);
            LOG.info("Finished Job:" + job.getJobName());

            Path finalPath = new Path("jobout/" + paths[pathIndex] + "/result");
            LOG.info("Copying Job Output to:" + finalPath);
            FileSystem fs = FileSystem.get(job);

            try {
                fs.mkdirs(finalPath.getParent());
                fs.rename(tempDir, finalPath);
                LOG.info("Copied Job Output to:" + finalPath);
            } finally {
                // fs.close();
            }
        } catch (IOException e) {
            LOG.error(StringUtils.stringifyException(e));
            e.printStackTrace();
        }
    }
}