Example usage for org.apache.hadoop.mapred JobConf setInt

List of usage examples for org.apache.hadoop.mapred JobConf setInt

Introduction

On this page you can find example usage for org.apache.hadoop.mapred JobConf setInt.

Prototype

public void setInt(String name, int value) 


Document

Set the value of the name property to an int.
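
Below is a minimal sketch of a typical setInt call when configuring a job. The property name is borrowed from the examples further down; the class name and numeric values are illustrative assumptions, not recommended settings.

import org.apache.hadoop.mapred.JobConf;

public class SetIntExample {
    public static void main(String[] args) {
        JobConf job = new JobConf();
        // Store an int-valued property; here, a task timeout of one hour in milliseconds (assumed value).
        job.setInt("mapred.task.timeout", 60 * 60 * 1000);
        // Read it back with getInt, supplying a default for when the property is unset.
        int timeout = job.getInt("mapred.task.timeout", 600000);
        System.out.println("mapred.task.timeout = " + timeout);
    }
}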

Usage

From source file:org.archive.hadoop.jobs.CDXGenerator.java

License:Apache License

/**
* Run the job.
*/
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        usage();
        return 1;
    }

    // Create a job configuration
    JobConf job = new JobConf(getConf());

    // Job name uses output dir to help identify it to the operator.
    job.setJobName("CDX Generator " + args[0]);

    // The inputs are a list of filenames, use the
    // FilenameInputFormat to pass them to the mappers.
    job.setInputFormat(FilenameInputFormat.class);

    // This is a map-only job, no reducers.
    job.setNumReduceTasks(0);

    // set timeout to a high value - 20 hours
    job.setInt("mapred.task.timeout", 72000000);

    // keep job running despite some failures in generating CDXs
    job.setBoolean("strictMode", false);

    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setMapperClass(CDXGeneratorMapper.class);
    job.setJarByClass(CDXGenerator.class);

    int arg = 0;
    if (args[arg].equals("-strictMode")) {
        job.setBoolean("strictMode", true);
        arg++;
    }

    String outputDir = args[arg];
    arg++;

    job.set("outputDir", outputDir);
    FileOutputFormat.setOutputPath(job, new Path(outputDir));

    boolean atLeastOneInput = false;
    for (int i = arg; i < args.length; i++) {
        FileSystem inputfs = FileSystem.get(new java.net.URI(args[i]), getConf());
        for (FileStatus status : inputfs.globStatus(new Path(args[i]))) {
            Path inputPath = status.getPath();
            atLeastOneInput = true;
            LOG.info("Add input path: " + inputPath);
            FileInputFormat.addInputPath(job, inputPath);
        }
    }
    if (!atLeastOneInput) {
        LOG.info("No input files to CDXGenerator.");
        return 0;
    }

    // Run the job!
    RunningJob rj = JobClient.runJob(job);
    if (!rj.isSuccessful()) {
        LOG.error("FAILED: " + rj.getID());
        return 2;
    }
    return 0;
}

From source file:org.archive.hadoop.jobs.WARCMetadataRecordGenerator.java

License:Apache License

/**
* Run the job.
*/
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        usage();
        return 1;
    }

    // Create a job configuration
    JobConf job = new JobConf(getConf());

    // Job name uses output dir to help identify it to the operator.
    job.setJobName("WARCMetadataRecord Generator " + args[0]);

    // The inputs are a list of filenames, use the
    // FilenameInputFormat to pass them to the mappers.
    job.setInputFormat(FilenameInputFormat.class);

    // This is a map-only job, no reducers.
    job.setNumReduceTasks(0);

    // set timeout to a high value - 20 hours
    job.setInt("mapred.task.timeout", 72000000);

    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setMapperClass(WARCMetadataRecordGeneratorMapper.class);
    job.setJarByClass(WARCMetadataRecordGenerator.class);

    //extract outlinks by default
    job.set("outputType", "outlinks");
    int arg = 0;
    if (args[arg].equals("-hopinfo")) {
        job.set("outputType", "hopinfo");
        arg++;
    }

    String outputDir = args[arg];
    arg++;

    job.set("outputDir", outputDir);
    FileOutputFormat.setOutputPath(job, new Path(outputDir));

    boolean atLeastOneInput = false;
    for (int i = arg; i < args.length; i++) {
        FileSystem inputfs = FileSystem.get(new java.net.URI(args[i]), getConf());
        for (FileStatus status : inputfs.globStatus(new Path(args[i]))) {
            Path inputPath = status.getPath();
            atLeastOneInput = true;
            LOG.info("Add input path: " + inputPath);
            FileInputFormat.addInputPath(job, inputPath);
        }
    }
    if (!atLeastOneInput) {
        LOG.info("No input files to WARCMetadataRecordGenerator.");
        return 0;
    }

    // Run the job!
    RunningJob rj = JobClient.runJob(job);
    if (!rj.isSuccessful()) {
        LOG.error("FAILED: " + rj.getID());
        return 2;
    }
    return 0;
}

From source file:org.archive.hadoop.jobs.WATGenerator.java

License:Apache License

/**
* Run the job.
*/
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        usage();
        return 1;
    }

    // Create a job configuration
    JobConf job = new JobConf(getConf());

    // Job name uses output dir to help identify it to the operator.
    job.setJobName("WAT Generator " + args[0]);

    // The inputs are a list of filenames, use the
    // FilenameInputFormat to pass them to the mappers.
    job.setInputFormat(FilenameInputFormat.class);

    // This is a map-only job, no reducers.
    job.setNumReduceTasks(0);

    // set timeout to a high value - 20 hours
    job.setInt("mapred.task.timeout", 72000000);

    // keep job running despite some failures in generating WATs
    job.setBoolean("strictMode", false);

    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setMapperClass(WATGeneratorMapper.class);
    job.setJarByClass(WATGenerator.class);

    int arg = 0;
    if (args[arg].equals("-strictMode")) {
        job.setBoolean("strictMode", true);
        arg++;
    }

    String outputDir = args[arg];
    arg++;

    job.set("outputDir", outputDir);
    FileOutputFormat.setOutputPath(job, new Path(outputDir));

    boolean atLeastOneInput = false;
    for (int i = arg; i < args.length; i++) {
        FileSystem inputfs = FileSystem.get(new java.net.URI(args[i]), getConf());
        for (FileStatus status : inputfs.globStatus(new Path(args[i]))) {
            Path inputPath = status.getPath();
            atLeastOneInput = true;
            LOG.info("Add input path: " + inputPath);
            FileInputFormat.addInputPath(job, inputPath);
        }
    }
    if (!atLeastOneInput) {
        LOG.info("No input files to WATGenerator.");
        return 0;
    }

    // Run the job!
    RunningJob rj = JobClient.runJob(job);
    if (!rj.isSuccessful()) {
        LOG.error("FAILED: " + rj.getID());
        return 2;
    }
    return 0;
}

From source file:org.cloudata.core.testjob.performance.TestMultiThreadCTable.java

License:Apache License

public static Path putData(String outputDir) throws IOException {
    CloudataConf nconf = new CloudataConf();

    JobConf jobConf = new JobConf(TestMultiThreadCTable.class);
    jobConf.set("user.name", nconf.getUserId());
    String libDir = CloudataMapReduceUtil.initMapReduce(jobConf);

    jobConf.setJobName("TestMultiThreadNTable_" + "(" + new Date() + ")");

    jobConf.setLong("mapred.task.timeout", 30 * 60 * 1000);

    Path outputPath = new Path(outputDir);

    FileOutputFormat.setOutputPath(jobConf, outputPath);

    JobClient jobClient = new JobClient();

    int numOfRowPerMap = 100000 / jobClient.getClusterStatus().getMaxMapTasks();
    jobConf.setInt("numOfRowPerMap", numOfRowPerMap);
    //<MAP>
    jobConf.setMapperClass(PutDataMap.class);
    jobConf.setInputFormat(SimpleInputFormat.class);
    jobConf.setNumMapTasks(jobClient.getClusterStatus().getMaxMapTasks());
    jobConf.setMapSpeculativeExecution(false);
    jobConf.setMaxMapAttempts(0);
    //</MAP>

    //<REDUCE>
    jobConf.setNumReduceTasks(0);
    //</REDUCE>

    try {
        //Run Job
        JobClient.runJob(jobConf);
        return outputPath;
    } finally {
        FileSystem fs = FileSystem.get(jobConf);
        CloudataMapReduceUtil.clearMapReduce(libDir);
    }
}

From source file:org.cloudata.core.testjob.tera.TeraJob.java

License:Apache License

public void runJob(String tableName, int numOfTablets, int dataLength, int totalGb, String keyOutputPath)
        throws IOException {
    CloudataConf nconf = new CloudataConf();

    JobConf jobConf = new JobConf(TeraJob.class);
    jobConf.set("user.name", nconf.getUserId());
    String libDir = CloudataMapReduceUtil.initMapReduce(jobConf);

    if (!CTable.existsTable(nconf, tableName)) {
        TableSchema tableInfo = new TableSchema(tableName, "Test");
        tableInfo.addColumn(new ColumnInfo("Col1"));
        tableInfo.addColumn(new ColumnInfo("Col2", TableSchema.CACHE_TYPE));
        tableInfo.addColumn(new ColumnInfo("Col3"));
        CTable.createTable(nconf, tableInfo);
    }
    jobConf.setJobName("TeraOnlineJob" + "(" + new Date() + ")");

    long rowsPerTask = ((((long) totalGb) * 1024L * 1024L * 1024L) / ((long) dataLength)) / (long) numOfTablets;

    jobConf.setInt("teraJob.dataLength", dataLength);
    jobConf.setLong("teraJob.rowsPerTask", rowsPerTask);

    jobConf.setLong("mapred.task.timeout", 30 * 60 * 1000);

    FileOutputFormat.setOutputPath(jobConf, new Path(keyOutputPath));

    //<MAP>
    jobConf.setMapperClass(TeraOnlineMap.class);
    jobConf.setInputFormat(SimpleInputFormat.class);
    jobConf.set(AbstractTabletInputFormat.OUTPUT_TABLE, tableName);
    jobConf.setNumMapTasks(numOfTablets);
    jobConf.setMapSpeculativeExecution(false);
    jobConf.setMaxMapAttempts(0);
    //</MAP>

    //<REDUCE>
    jobConf.setNumReduceTasks(0);
    //</REDUCE>

    try {
        //Run Job
        JobClient.runJob(jobConf);
    } finally {
        //delete temp output path
        FileSystem fs = FileSystem.get(jobConf);
        CloudataMapReduceUtil.clearMapReduce(libDir);
    }
}

From source file:org.cloudata.examples.web.TermUploadJob.java

License:Apache License

public void exec(String[] options) throws Exception {
    if (options.length < 1) {
        System.out.println("Usage: java TermUploadJob <num of repeats> termUpload <inputPath> [#redcue]");
        System.exit(0);
    }
    JobConf jobConf = new JobConf(TermUploadJob.class);
    JobClient jobClient = new JobClient(jobConf);
    int maxReduce = jobClient.getClusterStatus().getMaxReduceTasks() * 2;
    if (options.length > 1) {
        maxReduce = Integer.parseInt(options[1]);
    }

    jobConf.setInt("mapred.task.timeout", 60 * 60 * 1000);

    FileSystem fs = FileSystem.get(jobConf);

    CloudataConf nconf = new CloudataConf();
    if (!CTable.existsTable(nconf, TERM_TABLE)) {
        // Table does not exist: collect term partition keys and create it.
        Path path = new Path("blogdata/tmp/weight");
        FileStatus[] paths = fs.listStatus(path);
        if (paths == null || paths.length == 0) {
            LOG.error("No Partition info:" + path);
            return;
        }
        SortedSet<Text> terms = new TreeSet<Text>();
        Text text = new Text();
        for (FileStatus eachPath : paths) {
            CloudataLineReader reader = new CloudataLineReader(fs.open(eachPath.getPath()));
            while (true) {
                int length = reader.readLine(text);
                if (length <= 0) {
                    break;
                }
                terms.add(new Text(text));
            }
        }

        int termsPerTablet = terms.size() / (maxReduce - 1);
        int count = 0;
        List<Row.Key> rowKeys = new ArrayList<Row.Key>();
        for (Text term : terms) {
            count++;
            if (count == termsPerTablet) {
                rowKeys.add(new Row.Key(term.getBytes()));
                count = 0;
            }
        }
        rowKeys.add(Row.Key.MAX_KEY);

        TableSchema termTableInfo = new TableSchema(TERM_TABLE, "Test", TERM_TABLE_COLUMNS);
        CTable.createTable(nconf, termTableInfo, rowKeys.toArray(new Row.Key[] {}));
    }
    CTable termTable = CTable.openTable(nconf, TERM_TABLE);
    TabletInfo[] tabletInfos = termTable.listTabletInfos();

    Path tempOutputPath = new Path("WebTableJob_" + System.currentTimeMillis());

    jobConf.setJobName("TermUploadJob" + "(" + new Date() + ")");
    FileInputFormat.addInputPath(jobConf, new Path(options[0]));

    //<MAP>
    jobConf.setMapperClass(TermUploadMap.class);
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    jobConf.setInputFormat(TextInputFormat.class);
    jobConf.set(AbstractTabletInputFormat.OUTPUT_TABLE, TERM_TABLE);
    jobConf.setPartitionerClass(WebKeyRangePartitioner.class);
    jobConf.setMaxMapAttempts(0);
    //</MAP>

    //<REDUCE>
    jobConf.setReducerClass(TermUploadReduce.class);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);
    jobConf.setNumReduceTasks(tabletInfos.length);
    FileOutputFormat.setOutputPath(jobConf, tempOutputPath);
    jobConf.setNumReduceTasks(maxReduce); // overrides the tablet-count value set above
    jobConf.setMaxReduceAttempts(0);
    //</REDUCE>

    //Run Job
    JobClient.runJob(jobConf);

    fs.delete(tempOutputPath, true);
}

From source file:org.commoncrawl.hadoop.io.ARCInputFormat.java

License:Open Source License

/**
 * Sets the number of bytes to read at a time from each input stream.
 *
 * @param job
 *          the job to set the IO block size for
 * @param blockSize
 *          the IO block size to use
 * 
 * @see #P_IO_BLOCK_SIZE
 */
public static void setIOBlockSize(JobConf job, int blockSize) {
    job.setInt(P_IO_BLOCK_SIZE, blockSize);
}

From source file:org.commoncrawl.hadoop.io.ARCInputFormat.java

License:Open Source License

/**
 * Sets the number of bytes to use for IO buffering.
 *
 * @param job
 *          the job to set the buffer size for
 * @param bufferSize
 *          the number of bytes to use for IO buffering
 * 
 * @see #P_IO_BUFFER_SIZE
 */
public static void setIOBufferSize(JobConf job, int bufferSize) {
    job.setInt(P_IO_BUFFER_SIZE, bufferSize);
}
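
As a hedged illustration, the two ARCInputFormat helpers above might be combined in a driver as sketched below; the class name and the numeric values are assumptions chosen only to show the calls, not recommended settings.

import org.apache.hadoop.mapred.JobConf;
import org.commoncrawl.hadoop.io.ARCInputFormat;

public class ArcIoTuningSketch {
    // Illustrative sketch: tune how ARC input streams are read and buffered.
    public static void configure(JobConf job) {
        ARCInputFormat.setIOBlockSize(job, 64 * 1024);        // assumed: 64 KB per read
        ARCInputFormat.setIOBufferSize(job, 4 * 1024 * 1024); // assumed: 4 MB of IO buffering
    }
}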

From source file:org.commoncrawl.hadoop.io.ARCSplitCalculator.java

License:Open Source License

/**
 * Sets the desired number of files per input split.
 *
 * <p>
 * Default is 1.
 * 
 * @param job
 *          the job to set the number of files per split for
 * @param filesPerSplit
 *          the desired number of ARC files per split
 * 
 * @see #P_FILES_PER_SPLIT
 */
public static final void setFilesPerSplit(JobConf job, int filesPerSplit) {
    job.setInt(P_FILES_PER_SPLIT, filesPerSplit);
}

From source file:org.commoncrawl.hadoop.io.ARCSplitCalculator.java

License:Open Source License

/**
 * Sets the desired number of megabytes per split.
 *
 * <p>
 * New files will be added to a split until the total size of the split
 * exceeds this threshold. Default is no limit.
 * 
 * @param job
 *          the job to set the number of megabytes per split for
 * @param mbPerSplit
 *          the desired number of megabytes per split
 */
public static final void setMegabytesPerSplit(JobConf job, int mbPerSplit) {
    job.setInt(P_MB_PER_SPLIT, mbPerSplit);
}
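
Similarly, a hypothetical driver could shape splits with the two ARCSplitCalculator helpers shown above; the class name and values below are assumptions for illustration only.

import org.apache.hadoop.mapred.JobConf;
import org.commoncrawl.hadoop.io.ARCSplitCalculator;

public class ArcSplitSizingSketch {
    // Illustrative sketch: group several ARC files into each split, capped by total size.
    public static void configure(JobConf job) {
        ARCSplitCalculator.setFilesPerSplit(job, 10);       // assumed: up to 10 ARC files per split
        ARCSplitCalculator.setMegabytesPerSplit(job, 512);  // assumed: roughly 512 MB per split
    }
}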