Example usage for org.apache.hadoop.mapred JobConf setOutputFormat

Introduction

On this page you can find example usages of org.apache.hadoop.mapred.JobConf.setOutputFormat.

Prototype

public void setOutputFormat(Class<? extends OutputFormat> theClass)

Source Link

Document

Set the OutputFormat implementation for the map-reduce job.

Usage

From source file:edu.ucsb.cs.lsh.projection.ProjectionsGenerator.java

License:Apache License

/**
 * Configures and submits the projection-generation job.
 *
 * <p>Reads the number of bits and the number of reducers from {@code job},
 * deletes any existing input/output directories, writes a small seed sequence
 * file containing one {@code (i, i)} record per reducer, and then submits a
 * job that uses {@code IdentityMapper} and {@code ProjectionReducer} with
 * {@code IntWritable} / {@code RandomVector} as output key/value classes.
 *
 * @param job the job configuration to populate and submit
 * @throws IOException if a file system operation or writing the seed file fails
 */
public static void main(JobConf job) throws IOException {
    job.setJobName(ProjectionsGenerator.class.getSimpleName());
    FileSystem fs = FileSystem.get(job);

    int nBits = job.getInt(ProjectionLshDriver.LSH_NBITS_PROPERTY, ProjectionLshDriver.LSH_NBITS_VALUE); // D
    int nFeatures = readCollectionFeatureCount(fs, job); // K
    setParameters(nBits, nFeatures);
    int nReducers = job.getInt(ProjectionLshDriver.LSH_NREDUCER_PROPERTY,
            ProjectionLshDriver.LSH_NREDUCER_VALUE);

    Path inputPath = new Path(INPUT_DIR);
    Path outputPath = new Path(OUTPUT_DIR);
    if (fs.exists(outputPath)) {
        fs.delete(outputPath, true);
    }
    if (fs.exists(inputPath)) {
        fs.delete(inputPath, true);
    }

    // Seed file: one (i, i) record per reducer.
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, job, new Path(inputPath.toString() + "/file"),
            IntWritable.class, IntWritable.class);
    try {
        for (int i = 0; i < nReducers; i++) {
            writer.append(new IntWritable(i), new IntWritable(i));
        }
    } finally {
        // Always release the writer, even when an append fails.
        writer.close();
    }

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);
    FileOutputFormat.setCompressOutput(job, false);

    job.set("mapred.child.java.opts", "-Xmx2048m");
    job.setInt("mapred.map.max.attempts", 10);
    job.setInt("mapred.reduce.max.attempts", 10);

    job.setNumMapTasks(1);
    job.setNumReduceTasks(nReducers);

    job.setMapperClass(IdentityMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setReducerClass(ProjectionReducer.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(RandomVector.class);

    JobSubmitter.run(job, "LSH", job.getFloat(Config.THRESHOLD_PROPERTY, Config.THRESHOLD_VALUE));
}

From source file:edu.ucsb.cs.lsh.projection.SignaturesGenerator.java

License:Apache License

/**
 * Configures and submits the map-only signature-generation job.
 *
 * <p>Parses generic Hadoop options from {@code args} into the job
 * configuration, prepares the distributed cache from the projections output
 * directory, removes any existing output directory, and submits a job that
 * runs {@code SigMapper} with zero reduce tasks, producing
 * {@code LongWritable} / {@code BitSignature} pairs in sequence-file format.
 *
 * @param args command-line arguments, handed to {@code GenericOptionsParser}
 * @throws Exception if job setup or submission fails
 */
public static void main(String[] args) throws Exception {
    JobConf job = new JobConf(SignaturesGenerator.class);
    new GenericOptionsParser(job, args);
    job.setJobName(SignaturesGenerator.class.getSimpleName());
    setParameters();

    FileSystem fs = FileSystem.get(job);
    prepareDistributedCache(job, fs, new Path(ProjectionsGenerator.OUTPUT_DIR));

    Path outputPath = new Path(OUTPUT_DIR);
    if (fs.exists(outputPath)) {
        // Explicit recursive delete; the one-argument overload is deprecated.
        fs.delete(outputPath, true);
    }

    FileInputFormat.setInputPaths(job, INPUT_DIR);
    FileOutputFormat.setOutputPath(job, outputPath);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.set("mapred.child.java.opts", "-Xmx2048m");
    job.setInt("mapred.map.max.attempts", 10);
    job.setInt("mapred.reduce.max.attempts", 10);
    job.setInt("mapred.task.timeout", 6000000);

    // Map-only job: with zero reducers the mapper output is the job output.
    job.setMapperClass(SigMapper.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(BitSignature.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(BitSignature.class);

    JobSubmitter.run(job, "LSH", -1);
}

From source file:edu.ucsb.cs.partitioning.cosine.CosinePartitioning.java

License:Apache License

/**
 * Wires the input and output locations and formats into the given job.
 *
 * <p>The input is read with {@code NonSplitableSequenceInputFormat}; the
 * output directory is deleted recursively before being registered with
 * {@code MultiSeqOutput}.
 *
 * @param job        the job configuration to modify
 * @param inputPath  path added to the job's inputs
 * @param outputPath path used (after deletion) as the job's output directory
 * @return the same {@code job} instance, for chaining
 * @throws IOException if the file system cannot be obtained or the delete fails
 */
public static JobConf setInputOutput(JobConf job, Path inputPath, Path outputPath) throws IOException {
    // Input side.
    job.setInputFormat(NonSplitableSequenceInputFormat.class);
    SequenceFileInputFormat.addInputPath(job, inputPath);

    // Output side: clear any previous results first.
    FileSystem fileSystem = FileSystem.get(job);
    fileSystem.delete(outputPath, true);
    job.setOutputFormat(MultiSeqOutput.class);
    MultiSeqOutput.setOutputPath(job, outputPath);

    return job;
}

From source file:edu.ucsb.cs.preprocessing.sequence.SeqWriter.java

License:Apache License

/**
 * Launches a map-only MapReduce job that converts a directory of text
 * records with numeric values into Hadoop sequence-file format. The text
 * input is expected to look like [id feature weight ..].
 *
 * @throws IOException if a file system operation fails
 */
public static void writeSequence() throws IOException {
    JobConf job = new JobConf();
    job.setJobName("Convert text vectors to hadoop seqeunce ");
    job.setJarByClass(SeqWriter.class);

    // Map-only: no reducers, so map output types equal job output types.
    job.setNumReduceTasks(0);
    job.setMapperClass(SeqMapper.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(FeatureWeightArrayWritable.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(FeatureWeightArrayWritable.class);

    // Text in.
    job.setInputFormat(TextInputFormat.class);
    TextInputFormat.addInputPath(job, new Path(INPUT_DIR));

    // Remove leftovers from earlier runs, then register sequence-file output.
    Path idsFile = new Path(HashPagesDriver.IDS_FILE2);
    FileSystem.get(job).delete(idsFile, true);
    Path outputPath = new Path(OUTPUT_DIR);
    FileSystem.get(job).delete(outputPath, true);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, outputPath);

    JobSubmitter.run(job, "PREPROCESS", -1);
}

From source file:edu.ucsb.cs.sort.length.LengthSortMain.java

License:Apache License

/**
 * Builds the job configuration (mapper, partitioner, reducer, key/value
 * classes, input and output) for sorting by vector length and submits it.
 *
 * @param args command-line arguments, handed to {@code GenericOptionsParser}
 * @throws IOException if a file system operation fails
 * @throws UnsupportedOperationException if {@code SortDriver.INPUT_DIR} is null
 */
public static void main(String[] args) throws IOException {
    JobConf job = new JobConf();
    new GenericOptionsParser(job, args);
    job.setJobName(LengthSortMain.class.getSimpleName());
    job.setJarByClass(LengthSortMain.class);

    // Map side.
    job.setMapperClass(LengthSortMapper.class);
    job.setMapOutputKeyClass(FloatWritable.class);
    job.setMapOutputValueClass(IdFeatureWeightArrayWritable.class);
    job.setPartitionerClass(LengthRangePartitioner.class);

    // Reduce side.
    int reducerCount = job.getInt(SortDriver.NUM_REDUCE_PROPERTY, SortDriver.NUM_REDUCE_VALUE);
    job.setReducerClass(LengthSortReducer.class);
    job.setNumReduceTasks(reducerCount);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(FeatureWeightArrayWritable.class);

    // Input: must have been configured on the driver.
    String inputDir = SortDriver.INPUT_DIR;
    if (inputDir == null) {
        throw new UnsupportedOperationException("ERROR: input path not set");
    }
    job.setInputFormat(SequenceFileInputFormat.class);
    SequenceFileInputFormat.addInputPath(job, new Path(inputDir));

    // Output: wipe the previous results before registering the directory.
    Path outputPath = new Path(SortDriver.OUTPUT_DIR);
    FileSystem fileSystem = FileSystem.get(job);
    fileSystem.delete(outputPath, true);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, outputPath);

    JobSubmitter.run(job, "Sort By Vector Lenghts", -1);
}

From source file:edu.ucsb.cs.sort.maxw.MaxwSortMain.java

License:Apache License

/**
 * Main method sets the job configurations including the mapper and reducer
 * classes to do the sorting./*  w  ww .j  a  v  a2s .c o  m*/
 */
public static void main(String[] args) throws IOException {

    JobConf job = new JobConf();
    new GenericOptionsParser(job, args);
    // ToolRunner.printGenericCommandUsage(System.out);
    job.setJobName(MaxwSortMain.class.getSimpleName());
    job.setJarByClass(MaxwSortMain.class);
    job.setMapperClass(MaxwSortMapper.class);
    job.setMapOutputKeyClass(FloatWritable.class);
    job.setMapOutputValueClass(IdFeatureWeightArrayWritable.class);

    job.setPartitionerClass(MaxwRangePartitioner.class);

    job.setReducerClass(MaxwSortReducer.class);
    job.setNumReduceTasks(job.getInt(SortDriver.NUM_REDUCE_PROPERTY, SortDriver.NUM_REDUCE_VALUE));
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(FeatureWeightArrayWritable.class);
    //
    // set input & output
    //
    String inputDir = SortDriver.INPUT_DIR;
    if (inputDir == null) {
        throw new UnsupportedOperationException("ERROR: input path not set");
    }
    job.setInputFormat(SequenceFileInputFormat.class);
    SequenceFileInputFormat.addInputPath(job, new Path(inputDir));
    Path outputPath = new Path(SortDriver.OUTPUT_DIR);
    FileSystem.get(job).delete(outputPath, true);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, outputPath);
    //
    // run
    //
    JobSubmitter.run(job, "Sort By infinity-Norm", -1);
}

From source file:edu.ucsb.cs.sort.norm.NormSortMain.java

License:Apache License

/**
 * Prepares the sorting job: registers the mapper, partitioner and reducer
 * classes together with the key/value types and the input/output locations,
 * then submits it. Some of the produced partitions might be merged later to
 * reflect the number of partitions chosen by the user.
 *
 * @param args command-line arguments, handed to {@code GenericOptionsParser}
 * @throws IOException if a file system operation fails
 * @throws UnsupportedOperationException if {@code SortDriver.INPUT_DIR} is null
 */
public static void main(String[] args) throws IOException {
    JobConf job = new JobConf();
    new GenericOptionsParser(job, args);
    job.setJobName("NormSort");
    job.setJarByClass(NormSortMain.class);

    // Map side.
    job.setMapperClass(NormSortMapper.class);
    job.setMapOutputKeyClass(FloatWritable.class);
    job.setMapOutputValueClass(IdFeatureWeightArrayWritable.class);
    job.setPartitionerClass(NormRangePartitioner.class);

    // Reduce side.
    int reducerCount = job.getInt(SortDriver.NUM_REDUCE_PROPERTY, SortDriver.NUM_REDUCE_VALUE);
    job.setReducerClass(NormSortReducer.class);
    job.setNumReduceTasks(reducerCount);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(FeatureWeightArrayWritable.class);

    // Input: must have been configured on the driver.
    String inputDir = SortDriver.INPUT_DIR;
    if (inputDir == null) {
        throw new UnsupportedOperationException("ERROR: input path not set");
    }
    job.setInputFormat(SequenceFileInputFormat.class);
    SequenceFileInputFormat.addInputPath(job, new Path(inputDir));

    // Output: wipe the previous results before registering the directory.
    Path outputPath = new Path(SortDriver.OUTPUT_DIR);
    FileSystem fileSystem = FileSystem.get(job);
    fileSystem.delete(outputPath, true);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, outputPath);

    JobSubmitter.run(job, "Sort By p-norm", -1);
}

From source file:edu.ucsb.cs.sort.signature.SigSortMain.java

License:Apache License

/**
 * Builds the job configuration (mapper, partitioner, reducer, key/value
 * classes, input and output) for sorting based on signatures and submits it.
 *
 * @param args command-line arguments, handed to {@code GenericOptionsParser}
 * @throws IOException if a file system operation fails
 * @throws UnsupportedOperationException if {@code SortDriver.INPUT_DIR} is null
 */
public static void main(String[] args) throws IOException {
    JobConf job = new JobConf();
    new GenericOptionsParser(job, args);
    job.setJobName(SigSortMain.class.getSimpleName());
    job.setJarByClass(SigSortMain.class);

    // Map side: signature is the key, the long id is the value.
    job.setMapperClass(SigSortMapper.class);
    job.setMapOutputKeyClass(BitSignature.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setPartitionerClass(SigRangePartitioner.class);

    // Reduce side: key and value types are swapped relative to the map output.
    int reducerCount = job.getInt(SortDriver.NUM_REDUCE_PROPERTY, SortDriver.NUM_REDUCE_VALUE);
    job.setReducerClass(SigSortReducer.class);
    job.setNumReduceTasks(reducerCount);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(BitSignature.class);

    // Input: must have been configured on the driver.
    String inputDir = SortDriver.INPUT_DIR;
    if (inputDir == null) {
        throw new UnsupportedOperationException("ERROR: input path not set");
    }
    job.setInputFormat(SequenceFileInputFormat.class);
    SequenceFileInputFormat.addInputPath(job, new Path(inputDir));

    // Output: wipe the previous results before registering the directory.
    Path outputPath = new Path(OUTPUT_PATH);
    FileSystem fileSystem = FileSystem.get(job);
    fileSystem.delete(outputPath, true);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, outputPath);

    JobSubmitter.run(job, "Sort By Signature Bytes", -1);
}

From source file:edu.umd.cloud9.collection.aquaint2.NumberAquaint2Documents.java

License:Apache License

/**
 * Runs this tool.
 *
 * <p>Expects exactly three arguments: the input path, the job output path
 * and the output file. Runs a job with a single reducer and then passes the
 * job's {@code part-00000} file and the output file to
 * {@code Aquaint2DocnoMapping.writeDocnoData}.
 *
 * @param args {@code [inputPath, outputPath, outputFile]}
 * @return 0 after the job and the mapping write complete, -1 if the argument
 *         count is wrong
 * @throws Exception if the job or a file system operation fails
 */
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        printUsage();
        return -1;
    }

    final String inputPath = args[0];
    final String outputPath = args[1];
    final String outputFile = args[2];
    final int mapTasks = 10;

    LOG.info("Tool: " + NumberAquaint2Documents.class.getCanonicalName());
    LOG.info(" - Input path: " + inputPath);
    LOG.info(" - Output path: " + outputPath);
    LOG.info(" - Output file: " + outputFile);

    JobConf conf = new JobConf(NumberAquaint2Documents.class);
    conf.setJobName(NumberAquaint2Documents.class.getSimpleName());

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(1);

    Path outputDir = new Path(outputPath);
    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, outputDir);
    FileOutputFormat.setCompressOutput(conf, false);

    conf.setInputFormat(Aquaint2DocumentInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(outputDir, true);

    JobClient.runJob(conf);

    // The single reducer's output is the first (and only) part file.
    Path reducerOutput = new Path(outputPath + "/part-00000");
    Aquaint2DocnoMapping.writeDocnoData(reducerOutput, new Path(outputFile), FileSystem.get(conf));

    return 0;
}

From source file:edu.umd.cloud9.collection.clue.CountClueWarcRecords.java

License:Apache License

/**
 * Runs this tool.
 *
 * <p>Parses the command line, configures a map-only job over either the
 * original collection segment or the repacked SequenceFiles, runs it, and
 * reads the {@code Records.PAGES} counter. If the count option is present the
 * counter value is also written, as decimal text, to the given file.
 *
 * @param args command-line arguments
 * @return 0 on success, -1 if the command line is invalid or incomplete
 * @throws Exception if the job or a file system operation fails
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(new Option(ORIGINAL_OPTION, "use original ClueWeb09 distribution"));
    options.addOption(new Option(REPACKED_OPTION, "use repacked SequenceFiles"));

    options.addOption(OptionBuilder.withArgName("path").hasArg()
            .withDescription("path: base path for 'original', actual path for 'repacked'").create(PATH_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("DocnoMapping data path")
            .create(MAPPING_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg()
            .withDescription("segment number (required if 'original')").create(SEGMENT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg()
            .withDescription("output file to write the number of records").create(COUNT_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    // Exactly one of the two source modes has to be selected; -repacked wins if both are given.
    boolean repacked;
    if (cmdline.hasOption(REPACKED_OPTION)) {
        repacked = true;
    } else if (cmdline.hasOption(ORIGINAL_OPTION)) {
        repacked = false;
    } else {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Expecting either -original or -repacked");
        return -1;
    }

    // Path and mapping are always required; the segment only for the original layout.
    if (!cmdline.hasOption(PATH_OPTION) || !cmdline.hasOption(MAPPING_OPTION)
            || (!repacked && !cmdline.hasOption(SEGMENT_OPTION))) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String path = cmdline.getOptionValue(PATH_OPTION);
    String mappingFile = cmdline.getOptionValue(MAPPING_OPTION);

    int segment = 1;
    if (!repacked) {
        segment = Integer.parseInt(cmdline.getOptionValue(SEGMENT_OPTION));
    }

    LOG.info("Tool name: " + CountClueWarcRecords.class.getSimpleName());
    LOG.info(" - repacked: " + repacked);
    LOG.info(" - path: " + path);
    LOG.info(" - mapping file: " + mappingFile);
    if (!repacked) {
        LOG.info(" - segment number: " + segment);
    }

    FileSystem fs = FileSystem.get(getConf());
    int mapTasks = 10;

    JobConf conf = new JobConf(getConf(), CountClueWarcRecords.class);
    conf.setJobName(
            CountClueWarcRecords.class.getSimpleName() + (repacked ? ":" + path : ":segment" + segment));

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(0);

    if (repacked) {
        // Note, we have to add the files one by one, otherwise, SequenceFileInputFormat
        // thinks its a MapFile.
        for (FileStatus status : fs.listStatus(new Path(path))) {
            FileInputFormat.addInputPath(conf, status.getPath());
        }
    } else {
        ClueCollectionPathConstants.addEnglishCollectionPart(conf, path, segment);
    }

    DistributedCache.addCacheFile(new URI(mappingFile), conf);

    if (repacked) {
        conf.setInputFormat(SequenceFileInputFormat.class);
    } else {
        conf.setInputFormat(ClueWarcInputFormat.class);
    }

    conf.setOutputFormat(NullOutputFormat.class);
    conf.setMapperClass(MyMapper.class);

    RunningJob job = JobClient.runJob(conf);
    Counters counters = job.getCounters();
    // Keep the counter as a long: narrowing to int would silently truncate large counts.
    long numDocs = counters.findCounter(Records.PAGES).getCounter();

    LOG.info("Read " + numDocs + " docs.");

    if (cmdline.hasOption(COUNT_OPTION)) {
        String f = cmdline.getOptionValue(COUNT_OPTION);
        FSDataOutputStream out = fs.create(new Path(f));
        try {
            // Explicit charset so the file content does not depend on the platform default.
            out.write(Long.toString(numDocs).getBytes("UTF-8"));
        } finally {
            // Release the stream even if the write fails.
            out.close();
        }
    }

    return 0;
}