Example usage for org.apache.hadoop.mapred JobConf setMapOutputKeyClass

List of usage examples for org.apache.hadoop.mapred JobConf setMapOutputKeyClass

Introduction

On this page you can find example usage of org.apache.hadoop.mapred JobConf setMapOutputKeyClass.

Prototype

public void setMapOutputKeyClass(Class<?> theClass) 

Source Link

Document

Set the key class for the map output data.
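
By default the map output key class falls back to the job's final output key class (the one set via setOutputKeyClass), so this call is only needed when the mapper emits a different key type than the reducer writes. Below is a minimal, hedged sketch of that situation; the WordLengthJob, WordLengthMap and WordLengthReduce class names are hypothetical placeholders, and the usual org.apache.hadoop.io, org.apache.hadoop.fs and org.apache.hadoop.mapred imports are assumed.

public static void main(String[] args) throws IOException {
    JobConf jobConf = new JobConf(WordLengthJob.class);
    jobConf.setJobName("WordLengthJob");

    // The (hypothetical) mapper emits (IntWritable wordLength, Text word) pairs,
    // so the map output key type differs from the final output key type.
    jobConf.setMapperClass(WordLengthMap.class);
    jobConf.setMapOutputKeyClass(IntWritable.class);
    jobConf.setMapOutputValueClass(Text.class);

    // The (hypothetical) reducer writes (Text word, IntWritable length) pairs,
    // so the final output types are declared separately.
    jobConf.setReducerClass(WordLengthReduce.class);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(IntWritable.class);

    FileInputFormat.setInputPaths(jobConf, new Path(args[0]));
    FileOutputFormat.setOutputPath(jobConf, new Path(args[1]));

    JobClient.runJob(jobConf);
}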

Usage

From source file:org.cloudata.examples.web.DocFreqJob.java

License:Apache License

public void exec(String[] options) throws Exception {
    if (options.length < 1) {
        System.out.println("Usage: java DocFreqJob <num of repeats> docFreq [#reduce]");
        System.exit(0);
    }

    JobConf jobConf = new JobConf(DocFreqJob.class);

    JobClient jobClient = new JobClient(jobConf);
    int maxReduce = jobClient.getClusterStatus().getMaxReduceTasks() * 2;
    if (options.length > 0) {
        maxReduce = Integer.parseInt(options[0]);
    }

    CloudataConf nconf = new CloudataConf();
    if (!CTable.existsTable(nconf, TermUploadJob.TERM_TABLE)) {
        TableSchema termTableInfo = new TableSchema(TermUploadJob.TERM_TABLE, "Test",
                TermUploadJob.TERM_TABLE_COLUMNS);
        CTable.createTable(nconf, termTableInfo);
    }
    Path tempOutputPath = new Path("DocFreqJob_" + System.currentTimeMillis());

    jobConf.setJobName("DocFreqJob" + "(" + new Date() + ")");

    //<MAP>
    jobConf.setMapperClass(DocFreqMap.class);
    jobConf.setInputFormat(AbstractTabletInputFormat.class);
    jobConf.set(AbstractTabletInputFormat.INPUT_TABLE, WebTableJob.WEB_TABLE);
    jobConf.set(AbstractTabletInputFormat.INPUT_COLUMN_LIST,
            WebTableJob.WEB_TABLE_COLUMNS[1] + "," + WebTableJob.WEB_TABLE_COLUMNS[2]);
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    //    jobConf.setMaxMapAttempts(0);
    //</MAP>

    //<REDUCE>
    jobConf.setReducerClass(DocFreqReduce.class);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);
    FileOutputFormat.setOutputPath(jobConf, tempOutputPath);
    jobConf.setNumReduceTasks(maxReduce);
    //</REDUCE>

    //Run Job
    JobClient.runJob(jobConf);

    // delete temp output path
    FileSystem fs = FileSystem.get(jobConf);
    fs.delete(tempOutputPath, true);
}

From source file:org.cloudata.examples.web.TermGlobalJob.java

License:Apache License

public void exec() throws Exception {
    CloudataConf nconf = new CloudataConf();
    if (!CTable.existsTable(nconf, GLOBAL_TABLE)) {
        TableSchema globalTableInfo = new TableSchema(GLOBAL_TABLE, "Test", GLOBAL_TABLE_COLUMNS);
        CTable.createTable(nconf, globalTableInfo);
    }

    Path tempOutputPath = new Path("globalTableInfo" + System.currentTimeMillis());

    JobConf jobConf = new JobConf(WebTableJob.class);
    jobConf.setJobName("TermGlobalJob" + "(" + new Date() + ")");

    //<MAP>
    jobConf.setMapperClass(TermGlobalMap.class);
    jobConf.setInputFormat(DefaultTabletInputFormat.class);
    jobConf.set(DefaultTabletInputFormat.INPUT_TABLE, WebTableJob.WEB_TABLE);
    jobConf.set(DefaultTabletInputFormat.INPUT_COLUMN_LIST, WebTableJob.WEB_TABLE_COLUMNS[2]);
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    //</MAP>

    //<REDUCE>
    jobConf.setReducerClass(TermGlobalReduce.class);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);
    jobConf.setNumReduceTasks(1);
    jobConf.setMaxReduceAttempts(0);
    FileOutputFormat.setOutputPath(jobConf, tempOutputPath);
    //</REDUCE>

    //Run Job
    JobClient.runJob(jobConf);

    //delete temp output path
    FileSystem fs = FileSystem.get(jobConf);
    fs.delete(tempOutputPath, true);
}

From source file:org.cloudata.examples.web.TermUploadJob.java

License:Apache License

public void exec(String[] options) throws Exception {
    if (options.length < 1) {
        System.out.println("Usage: java TermUploadJob <num of repeats> termUpload <inputPath> [#redcue]");
        System.exit(0);
    }
    JobConf jobConf = new JobConf(TermUploadJob.class);
    JobClient jobClient = new JobClient(jobConf);
    int maxReduce = jobClient.getClusterStatus().getMaxReduceTasks() * 2;
    if (options.length > 1) {
        maxReduce = Integer.parseInt(options[1]);
    }

    jobConf.setInt("mapred.task.timeout", 60 * 60 * 1000);

    FileSystem fs = FileSystem.get(jobConf);

    CloudataConf nconf = new CloudataConf();
    if (!CTable.existsTable(nconf, TERM_TABLE)) {
        // Create the term table with pre-split row keys if it does not exist
        Path path = new Path("blogdata/tmp/weight");
        FileStatus[] paths = fs.listStatus(path);
        if (paths == null || paths.length == 0) {
            LOG.error("No Partition info:" + path);
            return;
        }
        SortedSet<Text> terms = new TreeSet<Text>();
        Text text = new Text();
        for (FileStatus eachPath : paths) {
            CloudataLineReader reader = new CloudataLineReader(fs.open(eachPath.getPath()));
            while (true) {
                int length = reader.readLine(text);
                if (length <= 0) {
                    break;
                }
                terms.add(new Text(text));
            }
        }

        int termsPerTablet = terms.size() / (maxReduce - 1);
        int count = 0;
        List<Row.Key> rowKeys = new ArrayList<Row.Key>();
        for (Text term : terms) {
            count++;
            if (count == termsPerTablet) {
                rowKeys.add(new Row.Key(term.getBytes()));
                count = 0;
            }
        }
        rowKeys.add(Row.Key.MAX_KEY);

        TableSchema termTableInfo = new TableSchema(TERM_TABLE, "Test", TERM_TABLE_COLUMNS);
        CTable.createTable(nconf, termTableInfo, rowKeys.toArray(new Row.Key[] {}));
    }
    CTable termTable = CTable.openTable(nconf, TERM_TABLE);
    TabletInfo[] tabletInfos = termTable.listTabletInfos();

    Path tempOutputPath = new Path("WebTableJob_" + System.currentTimeMillis());

    jobConf.setJobName("TermUploadJob" + "(" + new Date() + ")");
    FileInputFormat.addInputPath(jobConf, new Path(options[0]));

    //<MAP>
    jobConf.setMapperClass(TermUploadMap.class);
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    jobConf.setInputFormat(TextInputFormat.class);
    jobConf.set(AbstractTabletInputFormat.OUTPUT_TABLE, TERM_TABLE);
    jobConf.setPartitionerClass(WebKeyRangePartitioner.class);
    jobConf.setMaxMapAttempts(0);
    //</MAP>

    //<REDUCE>
    jobConf.setReducerClass(TermUploadReduce.class);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);
    jobConf.setNumReduceTasks(tabletInfos.length);
    FileOutputFormat.setOutputPath(jobConf, tempOutputPath);
    jobConf.setNumReduceTasks(maxReduce);
    jobConf.setMaxReduceAttempts(0);
    //</REDUCE>

    //Run Job
    JobClient.runJob(jobConf);

    fs.delete(tempOutputPath);
}

From source file:org.cloudata.examples.web.TermWeightJob.java

License:Apache License

public void exec(String[] options) throws Exception {
    if (options.length < 1) {
        System.out.println("Usage: java TermWeightJob <num of repeats> termWeight <outputPath> [noGlobal]");
        System.exit(0);
    }
    if (options.length == 1 || !options[1].equals("noGlobal")) {
        TermGlobalJob termGlobalJob = new TermGlobalJob();
        termGlobalJob.exec();
    }
    Path outputPath = new Path(options[0]);

    JobConf jobConf = new JobConf(WebTableJob.class);
    jobConf.setJobName("TermWeightJob" + "(" + new Date() + ")");

    //<MAP>
    jobConf.setMapperClass(TermWeightMap.class);
    jobConf.setInputFormat(DefaultTabletInputFormat.class);
    jobConf.set(DefaultTabletInputFormat.INPUT_TABLE, WebTableJob.WEB_TABLE);
    jobConf.set(DefaultTabletInputFormat.INPUT_COLUMN_LIST,
            WebTableJob.WEB_TABLE_COLUMNS[1] + "," + WebTableJob.WEB_TABLE_COLUMNS[2]);
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    //    jobConf.setMaxMapAttempts(0);
    //</MAP>

    //<REDUCE>
    jobConf.setReducerClass(TermWeightReduce.class);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);
    FileOutputFormat.setOutputPath(jobConf, outputPath);
    //jobConf.setMaxReduceAttempts(0);
    JobClient jobClient = new JobClient(jobConf);
    int maxReduce = jobClient.getClusterStatus().getMaxReduceTasks();

    jobConf.setNumReduceTasks(maxReduce);
    //</REDUCE>

    //Run Job
    JobClient.runJob(jobConf);

    //    //delete temp output path
    //    FileSystem fs = FileSystem.get(jobConf);
    //    fs.delete(outputPath);
}

From source file:org.cloudata.examples.web.TermWeightJobOnline.java

License:Apache License

public void exec(String[] options) throws Exception {
    if (options.length < 1) {
        System.out.println("Usage: java TermWeightJobOnline <num of repeats> termWeightOnline [noGlobal]");
        System.exit(0);
    }
    if (options.length == 1 || !options[1].equals("noGlobal")) {
        TermGlobalJob termGlobalJob = new TermGlobalJob();
        termGlobalJob.exec();
    }
    Path outputPath = new Path("TermWeightJobOnline_" + System.currentTimeMillis());

    JobConf jobConf = new JobConf(TermWeightJobOnline.class);
    jobConf.setJobName("TermWeightJobOnline" + "(" + new Date() + ")");

    //<MAP>
    jobConf.setMapperClass(TermWeightMap.class);
    jobConf.setInputFormat(DefaultTabletInputFormat.class);
    jobConf.set(DefaultTabletInputFormat.INPUT_TABLE, WebTableJob.WEB_TABLE);
    jobConf.set(DefaultTabletInputFormat.INPUT_COLUMN_LIST,
            WebTableJob.WEB_TABLE_COLUMNS[1] + "," + WebTableJob.WEB_TABLE_COLUMNS[2]);
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    //    jobConf.setMaxMapAttempts(0);
    //</MAP>

    //<REDUCE>
    jobConf.setReducerClass(TermWeightReduceOnline.class);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);
    FileOutputFormat.setOutputPath(jobConf, outputPath);
    //jobConf.setMaxReduceAttempts(0);
    JobClient jobClient = new JobClient(jobConf);
    int maxReduce = jobClient.getClusterStatus().getMaxReduceTasks();

    jobConf.setNumReduceTasks(maxReduce);
    //</REDUCE>

    //Run Job
    try {
        JobClient.runJob(jobConf);
    } finally {
        //delete temp output path
        FileSystem fs = FileSystem.get(jobConf);
        fs.delete(outputPath, true);
    }
}

From source file:org.cloudata.examples.weblink.InLinkJob.java

License:Apache License

public static void main(String[] args) throws IOException {
    if (args.length < 2) {
        System.out.println("Usage: java InLinkJob <input table> <output table>");
        System.exit(0);
    }

    String inputTableName = args[0];
    String outputTableName = args[1];

    CloudataConf nconf = new CloudataConf();
    if (!CTable.existsTable(nconf, outputTableName)) {
        TableSchema tableSchema = new TableSchema(outputTableName);
        tableSchema.addColumn("inlink");

        CTable.createTable(nconf, tableSchema);
    }

    Path tempPath = new Path("InLinkJob_tmp" + System.currentTimeMillis());

    JobConf jobConf = new JobConf(InLinkJob.class);
    String libDir = CloudataMapReduceUtil.initMapReduce(jobConf);

    //<Map>
    jobConf.setMapperClass(InLinkMap.class);
    jobConf.setInputFormat(DefaultTabletInputFormat.class);
    jobConf.set(DefaultTabletInputFormat.INPUT_TABLE, inputTableName);
    jobConf.set(DefaultTabletInputFormat.INPUT_COLUMN_LIST, "outlink");
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(TextArray.class);
    //</Map>

    //<Reduce>
    FileOutputFormat.setOutputPath(jobConf, tempPath);
    jobConf.setReducerClass(InLinkReduce.class);
    jobConf.set(DefaultTabletInputFormat.OUTPUT_TABLE, outputTableName);
    //</Reduce>

    try {
        JobClient.runJob(jobConf);
    } finally {
        CloudataFileSystem fs = CloudataFileSystem.get(nconf);
        fs.delete(new GPath(tempPath.toString()), true);
        CloudataMapReduceUtil.clearMapReduce(libDir);
    }
}

From source file:org.cloudata.examples.weblink.ScanJob.java

License:Apache License

public void run(String[] args) throws IOException {
    if (args.length < 1) {
        System.out.println("Usage: java ScanJob <table name>");
        System.exit(0);
    }

    String tableName = args[0];

    CloudataConf nconf = new CloudataConf();
    if (!CTable.existsTable(nconf, tableName)) {
        System.out.println("No table: " + tableName);
        System.exit(0);
    }

    JobConf jobConf = new JobConf(UploadJob.class);
    jobConf.setJobName("CloudataExamles.weblink.ScanJob_" + new Date());
    String libDir = CloudataMapReduceUtil.initMapReduce(jobConf);

    // <MAP>
    jobConf.setInputFormat(DefaultTabletInputFormat.class);
    jobConf.setMapperClass(ScanJobMapper.class);
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    jobConf.set(AbstractTabletInputFormat.INPUT_TABLE, tableName);
    jobConf.set(AbstractTabletInputFormat.INPUT_COLUMN_LIST, "url");
    // </MAP>

    // <REDUCE>
    FileOutputFormat.setOutputPath(jobConf,
            new Path("CloudataExamles_WebScanJob_" + System.currentTimeMillis()));
    jobConf.setReducerClass(ScanJobReducer.class);
    jobConf.setNumReduceTasks(1);
    // </REDUCE>

    try {
        JobClient.runJob(jobConf);
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        CloudataMapReduceUtil.clearMapReduce(libDir);
    }
}

From source file:org.cloudata.util.matrix.AbstractMatrix.java

License:Apache License

public void mutiply(AbstractMatrix targetMatrix, AbstractMatrix resultMatrix) throws IOException {
    Path tempOutputPath = new Path("temp/Matrix_" + System.currentTimeMillis());

    JobConf jobConf = new JobConf(AbstractMatrix.class);
    jobConf.setJobName("Matrix_Mutiply_Job" + "(" + new Date() + ")");

    //<MAP>
    jobConf.setMapperClass(MatrixMutiplyMap.class);
    jobConf.setInputFormat(MatrixInputFormat.class);
    jobConf.set(MatrixInputFormat.MATRIX_INPUT_TABLE, ctable.getTableName());
    jobConf.set(MatrixInputFormat.MATRIX_INPUT_COLUMN, columnName);
    jobConf.set(MatrixInputFormat.MATRIX_TARGET_TABLE, targetMatrix.ctable.getTableName());
    jobConf.set(MatrixInputFormat.MATRIX_TARGET_COLUMN, targetMatrix.columnName);
    jobConf.setBoolean(MatrixInputFormat.MATRIX_TARGET_SPARSE, targetMatrix.isSparse());
    jobConf.setMapOutputKeyClass(MatrixItem.class);
    jobConf.setMapOutputValueClass(Text.class);
    //</MAP>

    //<REDUCE>
    jobConf.setPartitionerClass(KeyRangePartitioner.class);
    jobConf.set(AbstractTabletInputFormat.OUTPUT_TABLE, resultMatrix.ctable.getTableName());
    jobConf.setReducerClass(MatrixMutiplyReduce.class);
    jobConf.set(MatrixInputFormat.MATRIX_RESULT_TABLE, resultMatrix.ctable.getTableName());
    jobConf.set(MatrixInputFormat.MATRIX_RESULT_COLUMN, resultMatrix.columnName);
    jobConf.setBoolean(MatrixInputFormat.MATRIX_RESULT_SPARSE, resultMatrix.isSparse());
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);

    TabletInfo[] tabletInfos = resultMatrix.ctable.listTabletInfos();

    jobConf.setNumReduceTasks(tabletInfos.length);
    jobConf.setMaxReduceAttempts(0);
    FileOutputFormat.setOutputPath(jobConf, tempOutputPath);
    //</REDUCE>

    //Run Job
    JobClient.runJob(jobConf);

    //delete temp output path
    FileSystem fs = FileSystem.get(jobConf);
    fs.delete(tempOutputPath, true);
}

From source file:org.cloudata.util.upload.UploadUtil.java

License:Apache License

private void doHadoopUpload(CloudataConf conf) throws IOException {
    if (!CTable.existsTable(conf, tableName)) {
        throw new IOException("No table:" + tableName);
    }

    JobConf jobConf = new JobConf(UploadUtil.class);
    String libDir = CloudataMapReduceUtil.initMapReduce(jobConf);

    jobConf.setJobName("UploadJob_" + tableName + "(" + new Date() + ")");

    // The target table name is set via AbstractTabletInputFormat.OUTPUT_TABLE
    // (it is also used when KeyRangePartitioner is configured).
    jobConf.set(AbstractTabletInputFormat.OUTPUT_TABLE, tableName);

    //<Map>
    FileInputFormat.addInputPath(jobConf, new Path(inputPath));
    jobConf.setInputFormat(TextInputFormat.class);
    jobConf.set("uploadJob.delim", delim);
    String columnStr = "";
    for (String eachColumn : columns) {
        columnStr += eachColumn + ",";
    }
    jobConf.set("uploadJob.columns", columnStr);

    String fieldNumStr = "";
    for (int eachField : fieldNums) {
        fieldNumStr += eachField + ",";
    }
    jobConf.set("uploadJob.fieldNums", fieldNumStr);
    jobConf.setBoolean("uploadJob.keyValuePair", keyValuePair);
    jobConf.setMapperClass(UploadMap.class);
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    jobConf.setMapSpeculativeExecution(false);
    jobConf.setMaxMapAttempts(0);
    //</Map>

    //<Reduce>
    Path tempOutputPath = new Path("temp/uploadJob/" + tableName + "/reducer");
    FileOutputFormat.setOutputPath(jobConf, tempOutputPath);
    jobConf.setNumReduceTasks(0);
    //</Reduce>

    try {
        JobClient.runJob(jobConf);
    } finally {
        FileSystem fs = FileSystem.get(jobConf);
        FileUtil.delete(fs, tempOutputPath, true);
        CloudataMapReduceUtil.clearMapReduce(libDir);
    }
}

From source file:org.commoncrawl.hadoop.io.S3GetMetdataJob.java

License:Open Source License

public static void main(String[] args) {

    String accessKey = args[0];
    String secretKey = args[1];

    String paths[] = {
            // "2008/06",
            // "2008/07",
            // "2008/08",
            // "2008/09",
            // "2008/10",
            // "2008/11",
            "2009" };

    for (int pathIndex = 0; pathIndex < paths.length; ++pathIndex) {

        LOG.info("Processing Path:" + paths[pathIndex]);

        JobConf job = new JobConf(S3GetMetdataJob.class);

        Path tempDir = new Path(
                job.get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

        LOG.info("Output for Path:" + paths[pathIndex] + " is:" + tempDir);
        System.out.println("Output Path is:" + tempDir);

        job.setJobName("S3 To CrawlURLMetadata Job for Path:" + paths[pathIndex]);

        // setup s3 properties
        JetS3tARCSource.setMaxRetries(job, 1);
        // set up S3 credentials ...
        JetS3tARCSource.setAWSAccessKeyID(job, accessKey);
        JetS3tARCSource.setAWSSecretAccessKey(job, secretKey);
        ARCSplitCalculator.setFilesPerSplit(job, 25);
        // set up arc reader properties
        ArcFileReader.setIOTimeoutValue(30000);
        // set input prefixes ...
        JetS3tARCSource.setInputPrefixes(job, paths[pathIndex]);
        // and S3 bucket name ...
        JetS3tARCSource.setBucketName(job, "commoncrawl");
        // and setup arc source for ArcInputFormat
        ARCInputFormat.setARCSourceClass(job, JetS3tARCSource.class);
        // and set up input format ...
        job.setInputFormat(ARCInputFormat.class);
        // set mapper ...
        job.setMapRunnerClass(S3GetMetdataJob.class);
        // setup reducer (identity in this case ... )
        job.setReducerClass(IdentityReducer.class);
        // standard output format ...
        job.setOutputFormat(SequenceFileOutputFormat.class);
        // set output path
        job.setOutputPath(tempDir);
        // map output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(CrawlURLMetadata.class);
        // reduce output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(CrawlURLMetadata.class);
        // double the number of reducers ...
        // job.setNumReduceTasks(job.getNumReduceTasks() * 2);

        // run the job ...
        try {
            LOG.info("Starting Job:" + job.getJobName());
            JobClient.runJob(job);
            LOG.info("Finished Job:" + job.getJobName());

            Path finalPath = new Path("jobout/" + paths[pathIndex] + "/result");
            LOG.info("Copying Job Output to:" + finalPath);
            FileSystem fs = FileSystem.get(job);

            try {
                fs.mkdirs(finalPath.getParent());
                fs.rename(tempDir, finalPath);
                LOG.info("Copied Job Output to:" + finalPath);
            } finally {
                // fs.close();
            }

        } catch (IOException e) {
            LOG.error(StringUtils.stringifyException(e));
            e.printStackTrace();
        }
    }
}