Example usage for org.apache.hadoop.mapred JobConf setOutputFormat

List of usage examples for org.apache.hadoop.mapred JobConf setOutputFormat

Introduction

On this page you can find example usage for org.apache.hadoop.mapred JobConf setOutputFormat.

Prototype

public void setOutputFormat(Class<? extends OutputFormat> theClass) 

Document

Set the OutputFormat implementation for the map-reduce job.
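
Before the real-world examples, here is a minimal, self-contained sketch of how setOutputFormat is typically called when configuring a job with the old org.apache.hadoop.mapred API. The class name and paths are placeholders, not taken from the examples below.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TextInputFormat;

public class SetOutputFormatSketch {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf(SetOutputFormatSketch.class);
        job.setJobName("setOutputFormat sketch");

        // Read plain text, write the records back out as a SequenceFile.
        job.setInputFormat(TextInputFormat.class);
        job.setOutputFormat(SequenceFileOutputFormat.class);

        // With no mapper/reducer set, Hadoop's identity classes are used,
        // so the output types match what TextInputFormat produces.
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        // Placeholder paths; point these at real input/output locations.
        FileInputFormat.setInputPaths(job, new Path("input"));
        FileOutputFormat.setOutputPath(job, new Path("output"));

        JobClient.runJob(job);
    }
}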

Usage

From source file:edu.ucsb.cs.lsh.projection.ProjectionsGenerator.java

License:Apache License

public static void main(JobConf job) throws IOException {
    int nBits/*D*/, nFeatures/*K*/, nReducers;
    job.setJobName(ProjectionsGenerator.class.getSimpleName());
    FileSystem fs = FileSystem.get(job);

    nBits = job.getInt(ProjectionLshDriver.LSH_NBITS_PROPERTY, ProjectionLshDriver.LSH_NBITS_VALUE);
    nFeatures = readCollectionFeatureCount(fs, job);
    setParameters(nBits, nFeatures);
    nReducers = job.getInt(ProjectionLshDriver.LSH_NREDUCER_PROPERTY, ProjectionLshDriver.LSH_NREDUCER_VALUE);
    Path inputPath = new Path(INPUT_DIR);
    Path outputPath = new Path(OUTPUT_DIR);
    if (fs.exists(outputPath))
        fs.delete(outputPath, true);
    if (fs.exists(inputPath))
        fs.delete(inputPath, true);

    SequenceFile.Writer writer = SequenceFile.createWriter(fs, job, new Path(inputPath.toString() + "/file"),
            IntWritable.class, IntWritable.class);
    for (int i = 0; i < nReducers; i++)
        writer.append(new IntWritable(i), new IntWritable(i));
    writer.close();

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);
    FileOutputFormat.setCompressOutput(job, false);

    job.set("mapred.child.java.opts", "-Xmx2048m");
    job.setInt("mapred.map.max.attempts", 10);
    job.setInt("mapred.reduce.max.attempts", 10);

    job.setNumMapTasks(1);
    job.setNumReduceTasks(nReducers);

    job.setMapperClass(IdentityMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setReducerClass(ProjectionReducer.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(RandomVector.class);

    JobSubmitter.run(job, "LSH", job.getFloat(Config.THRESHOLD_PROPERTY, Config.THRESHOLD_VALUE));
}
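
The example above writes SequenceFile output with compression explicitly disabled. For contrast, a small sketch of how the same setOutputFormat call is usually paired with block-compressed output; the helper method and codec choice are illustrative, not part of the original source.

import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;

public class CompressedSeqOutputSketch {
    /** Switch an existing JobConf to block-compressed SequenceFile output. */
    public static void enableCompressedOutput(JobConf job) {
        job.setOutputFormat(SequenceFileOutputFormat.class);
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
        SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
    }
}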

From source file:edu.ucsb.cs.lsh.projection.SignaturesGenerator.java

License:Apache License

public static void main(String[] args) throws Exception {
    JobConf job = new JobConf(SignaturesGenerator.class);
    new GenericOptionsParser(job, args);
    job.setJobName(SignaturesGenerator.class.getSimpleName());
    int nBits = job.getInt(ProjectionLshDriver.LSH_NBITS_PROPERTY, ProjectionLshDriver.LSH_NBITS_VALUE);
    setParameters();
    FileSystem fs = FileSystem.get(job);
    prepareDistributedCache(job, fs, new Path(ProjectionsGenerator.OUTPUT_DIR));
    Path outputPath = new Path(OUTPUT_DIR);
    if (fs.exists(outputPath))
        fs.delete(outputPath, true);

    FileInputFormat.setInputPaths(job, INPUT_DIR);
    // Path(INPUT_DIR));
    FileOutputFormat.setOutputPath(job, outputPath);
    // FileOutputFormat.setCompressOutput(job, false);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.set("mapred.child.java.opts", "-Xmx2048m");
    job.setInt("mapred.map.max.attempts", 10);
    job.setInt("mapred.reduce.max.attempts", 10);
    job.setInt("mapred.task.timeout", 6000000);

    job.setMapperClass(SigMapper.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(BitSignature.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(BitSignature.class);

    JobSubmitter.run(job, "LSH", -1);
}

From source file:edu.ucsb.cs.partitioning.cosine.CosinePartitioning.java

License:Apache License

public static JobConf setInputOutput(JobConf job, Path inputPath, Path outputPath) throws IOException {
    job.setInputFormat(NonSplitableSequenceInputFormat.class);
    SequenceFileInputFormat.addInputPath(job, inputPath);
    FileSystem.get(job).delete(outputPath, true);
    job.setOutputFormat(MultiSeqOutput.class);
    MultiSeqOutput.setOutputPath(job, outputPath);
    return job;
}

From source file:edu.ucsb.cs.preprocessing.sequence.SeqWriter.java

License:Apache License

/**
 * Runs a map-only MR job to convert an input directory of numeric-valued
 * records into Hadoop sequence format. It assumes the input is text with
 * records of the form [id feature weight ..].
 */
public static void writeSequence() throws IOException {

    JobConf job = new JobConf();
    job.setJobName("Convert text vectors to hadoop seqeunce ");
    job.setJarByClass(SeqWriter.class);

    job.setMapperClass(SeqMapper.class);
    job.setNumReduceTasks(0);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(FeatureWeightArrayWritable.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(FeatureWeightArrayWritable.class);

    job.setInputFormat(TextInputFormat.class);
    TextInputFormat.addInputPath(job, new Path(INPUT_DIR));
    FileSystem.get(job).delete(new Path(HashPagesDriver.IDS_FILE2), true);
    Path outputPath = new Path(OUTPUT_DIR);
    FileSystem.get(job).delete(outputPath, true);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, outputPath);

    JobSubmitter.run(job, "PREPROCESS", -1);
}

From source file:edu.ucsb.cs.sort.length.LengthSortMain.java

License:Apache License

/**
 * Sets the job configurations including the mapper and reducer classes to
 * do the sorting based on vector lengths.
 */
public static void main(String[] args) throws IOException {

    JobConf job = new JobConf();
    new GenericOptionsParser(job, args);
    job.setJobName(LengthSortMain.class.getSimpleName());
    job.setJarByClass(LengthSortMain.class);
    job.setMapperClass(LengthSortMapper.class);
    job.setMapOutputKeyClass(FloatWritable.class);
    job.setMapOutputValueClass(IdFeatureWeightArrayWritable.class);

    job.setPartitionerClass(LengthRangePartitioner.class);

    job.setReducerClass(LengthSortReducer.class);
    job.setNumReduceTasks(job.getInt(SortDriver.NUM_REDUCE_PROPERTY, SortDriver.NUM_REDUCE_VALUE));
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(FeatureWeightArrayWritable.class);
    //
    // set input & output
    //
    String inputDir = SortDriver.INPUT_DIR;
    if (inputDir == null) {
        throw new UnsupportedOperationException("ERROR: input path not set");
    }
    job.setInputFormat(SequenceFileInputFormat.class);
    SequenceFileInputFormat.addInputPath(job, new Path(inputDir));
    Path outputPath = new Path(SortDriver.OUTPUT_DIR);
    FileSystem.get(job).delete(outputPath, true);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, outputPath);

    //
    // run
    //
    JobSubmitter.run(job, "Sort By Vector Lenghts", -1);
}

From source file:edu.ucsb.cs.sort.maxw.MaxwSortMain.java

License:Apache License

/**
 * Main method sets the job configurations including the mapper and reducer
 * classes to do the sorting.
 */
public static void main(String[] args) throws IOException {

    JobConf job = new JobConf();
    new GenericOptionsParser(job, args);
    // ToolRunner.printGenericCommandUsage(System.out);
    job.setJobName(MaxwSortMain.class.getSimpleName());
    job.setJarByClass(MaxwSortMain.class);
    job.setMapperClass(MaxwSortMapper.class);
    job.setMapOutputKeyClass(FloatWritable.class);
    job.setMapOutputValueClass(IdFeatureWeightArrayWritable.class);

    job.setPartitionerClass(MaxwRangePartitioner.class);

    job.setReducerClass(MaxwSortReducer.class);
    job.setNumReduceTasks(job.getInt(SortDriver.NUM_REDUCE_PROPERTY, SortDriver.NUM_REDUCE_VALUE));
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(FeatureWeightArrayWritable.class);
    //
    // set input & output
    //
    String inputDir = SortDriver.INPUT_DIR;
    if (inputDir == null) {
        throw new UnsupportedOperationException("ERROR: input path not set");
    }
    job.setInputFormat(SequenceFileInputFormat.class);
    SequenceFileInputFormat.addInputPath(job, new Path(inputDir));
    Path outputPath = new Path(SortDriver.OUTPUT_DIR);
    FileSystem.get(job).delete(outputPath, true);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, outputPath);
    //
    // run
    //
    JobSubmitter.run(job, "Sort By infinity-Norm", -1);
}

From source file:edu.ucsb.cs.sort.norm.NormSortMain.java

License:Apache License

/**
 * Main method sets the job configurations including the mapper and reducer
 * classes to do the sorting. Some of the produced partitions might be
 * merged later to reflect the number of partitions chosen by the user.
 */
public static void main(String[] args) throws IOException {

    JobConf job = new JobConf();
    new GenericOptionsParser(job, args);
    job.setJobName("NormSort");
    job.setJarByClass(NormSortMain.class);
    job.setMapperClass(NormSortMapper.class);
    job.setMapOutputKeyClass(FloatWritable.class);
    job.setMapOutputValueClass(IdFeatureWeightArrayWritable.class);

    job.setPartitionerClass(NormRangePartitioner.class);

    job.setReducerClass(NormSortReducer.class);
    job.setNumReduceTasks(job.getInt(SortDriver.NUM_REDUCE_PROPERTY, SortDriver.NUM_REDUCE_VALUE));
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(FeatureWeightArrayWritable.class);
    //
    // set input & output
    //
    String inputDir = SortDriver.INPUT_DIR;
    if (inputDir == null) {
        throw new UnsupportedOperationException("ERROR: input path not set");
    }
    job.setInputFormat(SequenceFileInputFormat.class);
    SequenceFileInputFormat.addInputPath(job, new Path(inputDir));
    Path outputPath = new Path(SortDriver.OUTPUT_DIR);
    FileSystem.get(job).delete(outputPath, true);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, outputPath);
    //
    // run
    //
    JobSubmitter.run(job, "Sort By p-norm", -1);
}

From source file:edu.ucsb.cs.sort.signature.SigSortMain.java

License:Apache License

/**
 * Sets the job configurations including the mapper and reducer classes to
 * do the sorting based on signatures.
 */
public static void main(String[] args) throws IOException {

    JobConf job = new JobConf();
    new GenericOptionsParser(job, args);
    job.setJobName(SigSortMain.class.getSimpleName());
    job.setJarByClass(SigSortMain.class);
    job.setMapperClass(SigSortMapper.class);
    job.setMapOutputKeyClass(BitSignature.class);
    job.setMapOutputValueClass(LongWritable.class);

    job.setPartitionerClass(SigRangePartitioner.class);

    job.setReducerClass(SigSortReducer.class);
    job.setNumReduceTasks(job.getInt(SortDriver.NUM_REDUCE_PROPERTY, SortDriver.NUM_REDUCE_VALUE));
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(BitSignature.class);
    //
    // set input & output
    //
    String inputDir = SortDriver.INPUT_DIR;
    if (inputDir == null) {
        throw new UnsupportedOperationException("ERROR: input path not set");
    }
    job.setInputFormat(SequenceFileInputFormat.class);
    SequenceFileInputFormat.addInputPath(job, new Path(inputDir));
    Path outputPath = new Path(OUTPUT_PATH);
    FileSystem.get(job).delete(outputPath, true);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, outputPath);

    //
    // run
    //
    JobSubmitter.run(job, "Sort By Signature Bytes", -1);
}

From source file:edu.umd.cloud9.collection.aquaint2.NumberAquaint2Documents.java

License:Apache License

/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        printUsage();
        return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];
    String outputFile = args[2];
    int mapTasks = 10;

    LOG.info("Tool: " + NumberAquaint2Documents.class.getCanonicalName());
    LOG.info(" - Input path: " + inputPath);
    LOG.info(" - Output path: " + outputPath);
    LOG.info(" - Output file: " + outputFile);

    JobConf conf = new JobConf(NumberAquaint2Documents.class);
    conf.setJobName(NumberAquaint2Documents.class.getSimpleName());

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    conf.setInputFormat(Aquaint2DocumentInputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);

    Aquaint2DocnoMapping.writeDocnoData(new Path(outputPath + "/part-00000"), new Path(outputFile),
            FileSystem.get(conf));

    return 0;
}

From source file:edu.umd.cloud9.collection.clue.CountClueWarcRecords.java

License:Apache License

/**
 * Runs this tool.
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(new Option(ORIGINAL_OPTION, "use original ClueWeb09 distribution"));
    options.addOption(new Option(REPACKED_OPTION, "use repacked SequenceFiles"));

    options.addOption(OptionBuilder.withArgName("path").hasArg()
            .withDescription("path: base path for 'original', actual path for 'repacked'").create(PATH_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("DocnoMapping data path")
            .create(MAPPING_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg()
            .withDescription("segment number (required if 'original')").create(SEGMENT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg()
            .withDescription("output file to write the number of records").create(COUNT_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    boolean repacked;
    if (cmdline.hasOption(REPACKED_OPTION)) {
        repacked = true;
    } else if (cmdline.hasOption(ORIGINAL_OPTION)) {
        repacked = false;
    } else {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Expecting either -original or -repacked");
        return -1;
    }

    if (!cmdline.hasOption(PATH_OPTION) || !cmdline.hasOption(MAPPING_OPTION)
            || (!repacked && !cmdline.hasOption(SEGMENT_OPTION))) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String path = cmdline.getOptionValue(PATH_OPTION);
    String mappingFile = cmdline.getOptionValue(MAPPING_OPTION);

    int segment = 1;
    if (!repacked) {
        segment = Integer.parseInt(cmdline.getOptionValue(SEGMENT_OPTION));
    }

    LOG.info("Tool name: " + CountClueWarcRecords.class.getSimpleName());
    LOG.info(" - repacked: " + repacked);
    LOG.info(" - path: " + path);
    LOG.info(" - mapping file: " + mappingFile);
    if (!repacked) {
        LOG.info(" - segment number: " + segment);
    }

    FileSystem fs = FileSystem.get(getConf());
    int mapTasks = 10;

    JobConf conf = new JobConf(getConf(), CountClueWarcRecords.class);
    conf.setJobName(
            CountClueWarcRecords.class.getSimpleName() + (repacked ? ":" + path : ":segment" + segment));

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(0);

    if (repacked) {
        // Note: we have to add the files one by one; otherwise SequenceFileInputFormat
        // thinks it's a MapFile.
        for (FileStatus status : fs.listStatus(new Path(path))) {
            FileInputFormat.addInputPath(conf, status.getPath());
        }
    } else {
        ClueCollectionPathConstants.addEnglishCollectionPart(conf, path, segment);
    }

    DistributedCache.addCacheFile(new URI(mappingFile), conf);

    if (repacked) {
        conf.setInputFormat(SequenceFileInputFormat.class);
    } else {
        conf.setInputFormat(ClueWarcInputFormat.class);
    }

    conf.setOutputFormat(NullOutputFormat.class);
    conf.setMapperClass(MyMapper.class);

    RunningJob job = JobClient.runJob(conf);
    Counters counters = job.getCounters();
    int numDocs = (int) counters.findCounter(Records.PAGES).getCounter();

    LOG.info("Read " + numDocs + " docs.");

    if (cmdline.hasOption(COUNT_OPTION)) {
        String f = cmdline.getOptionValue(COUNT_OPTION);
        FSDataOutputStream out = fs.create(new Path(f));
        out.write(Integer.toString(numDocs).getBytes());
        out.close();
    }

    return 0;
}
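
The last example pairs setOutputFormat(NullOutputFormat.class) with a map-only job whose only product is a counter value. A stripped-down sketch of that pattern follows; the class, mapper, and counter enum are hypothetical stand-ins, not part of the original source.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.lib.NullOutputFormat;

public class CountRecordsSketch {
    enum Records { LINES } // hypothetical counter

    public static class CountMapper extends MapReduceBase
            implements Mapper<LongWritable, Text, NullWritable, NullWritable> {
        public void map(LongWritable key, Text value,
                OutputCollector<NullWritable, NullWritable> output, Reporter reporter)
                throws IOException {
            reporter.incrCounter(Records.LINES, 1); // count the record, emit nothing
        }
    }

    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(CountRecordsSketch.class);
        conf.setJobName("count records");

        conf.setNumReduceTasks(0); // map-only
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(NullOutputFormat.class); // no output files are written
        conf.setOutputKeyClass(NullWritable.class);
        conf.setOutputValueClass(NullWritable.class);
        conf.setMapperClass(CountMapper.class);
        FileInputFormat.setInputPaths(conf, new Path(args[0]));

        RunningJob job = JobClient.runJob(conf);
        long numRecords = job.getCounters().findCounter(Records.LINES).getCounter();
        System.out.println("Read " + numRecords + " records.");
    }
}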