Example usage for org.apache.hadoop.mapreduce Job setMapOutputValueClass

List of usage examples for org.apache.hadoop.mapreduce Job setMapOutputValueClass

Introduction

On this page you can find example usage of org.apache.hadoop.mapreduce Job setMapOutputValueClass.

Prototype

public void setMapOutputValueClass(Class<?> theClass) throws IllegalStateException 

Document

Set the value class for the map output data.
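
As a quick orientation before the full listings below, here is a minimal sketch of where setMapOutputValueClass typically sits in a job driver. The driver, mapper and reducer names (WordCountDriver, TokenizerMapper, IntSumReducer) are placeholders rather than classes from the examples on this page, and imports are omitted as in the listings below. Note that the method must be called before the job is submitted, which is why the prototype declares IllegalStateException.

Job job = Job.getInstance(new Configuration(), "word count");
job.setJarByClass(WordCountDriver.class);   // placeholder driver class
job.setMapperClass(TokenizerMapper.class);  // placeholder mapper emitting <Text, IntWritable>
job.setReducerClass(IntSumReducer.class);   // placeholder reducer

// intermediate (map output) types; only needed when they differ from the final
// output types, because the map output classes default to the job output classes
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);

// final (reducer output) types
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);

FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);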

Usage

From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.ConfigurationHelper.java

License:Apache License

/**
 * Job configurator
 *
 * @param job                      job instance
 * @param jarByClass               class of the jar
 * @param mapperClass              mapper
 * @param reducerClass             reducer
 * @param commaSeparatedInputFiles input paths
 * @param outputPath               output
 * @throws IOException I/O exception
 */
public static void configureJob(Job job, Class<?> jarByClass, Class<? extends Mapper> mapperClass,
        Class<? extends Reducer> reducerClass, String commaSeparatedInputFiles, String outputPath)
        throws IOException {
    job.setJarByClass(jarByClass);
    job.setJobName(jarByClass.getName());

    // mapper
    job.setMapperClass(mapperClass);

    // reducer
    job.setReducerClass(reducerClass);

    // input-output is warc
    job.setInputFormatClass(WARCInputFormat.class);
    // prevent producing empty files
    LazyOutputFormat.setOutputFormatClass(job, WARCOutputFormat.class);

    // intermediate data
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(WARCWritable.class);

    // output data
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(WARCWritable.class);

    // set output compression to GZip
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
}
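
For reference, a caller would typically invoke this helper from a Tool's run method and then override only the settings it needs, as the LangLicenseStatistics example further below does. A minimal sketch, assuming hypothetical MyJob, MyMapper and MyReducer classes:

@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());

    // MyJob, MyMapper and MyReducer are illustrative placeholders, not classes from this project
    ConfigurationHelper.configureJob(job, MyJob.class, MyMapper.class, MyReducer.class,
            args[0], args[1]);

    return job.waitForCompletion(true) ? 0 : 1;
}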

From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.full.Phase1FullJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());
    // set from the command line

    job.setJarByClass(Phase1FullJob.class);
    job.setJobName(Phase1FullJob.class.getName());

    // mapper
    job.setMapperClass(MapperClass.class);

    // we will compress the mapper's output (use fast Snappy compressor)
    job.getConfiguration().setBoolean(Job.MAP_OUTPUT_COMPRESS, true);
    job.getConfiguration().setClass(Job.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class);

    // reducer
    job.setReducerClass(SimpleWarcWriterReducer.class);

    // input-output is warc
    job.setInputFormatClass(WARCInputFormat.class);
    job.setOutputFormatClass(WARCOutputFormat.class);

    // mapper output data
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(WARCWritable.class);

    // set output compression to GZip
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.full.Phase2ExactMatchDeDuplication.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());
    //set from the command line

    job.setJarByClass(Phase2ExactMatchDeDuplication.class);
    job.setJobName(Phase2ExactMatchDeDuplication.class.getName());

    // mapper
    job.setMapperClass(ExactMatchDetectionMapper.class);

    // we will compress the mapper's output (use fast Snappy compressor)
    job.getConfiguration().setBoolean(Job.MAP_OUTPUT_COMPRESS, true);
    job.getConfiguration().setClass(Job.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class);

    // reducer
    job.setReducerClass(UniqueWarcWriterReducer.class);
    // no combiner, as the output classes in mapper and reducer are different!

    // input-output is warc
    job.setInputFormatClass(WARCInputFormat.class);
    job.setOutputFormatClass(WARCOutputFormat.class);

    // mapper output data
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(WARCWritable.class);

    // set output compression to GZip
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.full.Phase3Step1ExtractNearDupInfo.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());

    job.setJarByClass(Phase3Step1ExtractNearDupInfo.class);
    job.setJobName(Phase3Step1ExtractNearDupInfo.class.getName());

    // mapper
    job.setMapperClass(MapperClass.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(DocumentInfo.class);

    // reducer
    job.setReducerClass(DeDuplicationTextOutputReducer.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(List.class);

    job.setInputFormatClass(WARCInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, DocumentInfoOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = args[0];
    String outputPath = args[1];

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;

}

From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.full.Phase3Step2DistinctDataJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    Job job = Job.getInstance(getConf());
    job.setJarByClass(Phase3Step2DistinctDataJob.class);
    job.setJobName(Phase3Step2DistinctDataJob.class.getName());

    //mapper
    job.setMapperClass(RemoveRedundantDataMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(NullWritable.class);

    //reducer
    job.setReducerClass(RemoveRedundantDataReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    //paths
    String commaSeparatedInputFiles = args[0];
    String outputPath = args[1];

    job.setInputFormatClass(TextInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    //i/o paths
    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.full.Phase3Step3NearDupTuplesCreation.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());

    job.setJarByClass(Phase3Step3NearDupTuplesCreation.class);
    job.setJobName(Phase3Step3NearDupTuplesCreation.class.getName());

    // mapper
    job.setMapperClass(CreateTuplesMapper.class);
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(TreeSet.class);

    job.setInputFormatClass(TextInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = args[0];
    String outputPath = args[1];

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setNumReduceTasks(0); // map-only job: the mapper's output is written directly to the output format

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.full.Phase4RemoveDuplicatesUsingReduceSideJoins.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());

    job.setJarByClass(Phase4RemoveDuplicatesUsingReduceSideJoins.class);
    job.setJobName(Phase4RemoveDuplicatesUsingReduceSideJoins.class.getName());

    // paths
    // text files of ids to be deleted
    String textFilePath = args[0];
    // corpus with *.warc.gz
    String commaSeparatedInputFiles = args[1];
    // output
    String outputPath = args[2];

    // second input: the look-up text file
    MultipleInputs.addInputPath(job, new Path(textFilePath), TextInputFormat.class, JoinTextMapper.class);
    // first input: the data set (comma-separated paths supported)
    MultipleInputs.addInputPath(job, new Path(commaSeparatedInputFiles), WARCInputFormat.class,
            JoinWARCMapper.class);

    job.setPartitionerClass(SourceJoiningKeyPartitioner.class);
    job.setGroupingComparatorClass(SourceJoiningGroupingComparator.class);

    job.setMapOutputKeyClass(CompositeKey.class);
    job.setMapOutputValueClass(WARCWritable.class);

    job.setReducerClass(JoinReducer.class);

    job.setOutputFormatClass(WARCOutputFormat.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(WARCWritable.class);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.statistics.LangLicenseStatistics.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());

    ConfigurationHelper.configureJob(job, LangLicenseStatistics.class, MapperClass.class, ReducerClass.class,
            args[0], args[1]);

    // intermediate data
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(MapWritable.class);

    // output data
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setOutputFormatClass(TextOutputFormat.class);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:demo.SsJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    Job job = new Job(conf, "secondary sort");

    job.setJarByClass(SsJob.class);
    job.setPartitionerClass(NaturalKeyPartitioner.class);
    job.setGroupingComparatorClass(NaturalKeyGroupingComparator.class);
    job.setSortComparatorClass(CompositeKeyComparator.class);

    job.setMapOutputKeyClass(StockKey.class);
    job.setMapOutputValueClass(DoubleWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapperClass(SsMapper.class);
    job.setReducerClass(SsReducer.class);

    job.waitForCompletion(true);

    return 0;
}

From source file:diamondmapreduce.DiamondMapReduce.java

License:Apache License

int launchHamond(String[] arguments) throws Exception {

    //extract diamond, query, reference and output from array
    String diamond = arguments[0];
    String query = arguments[1];
    String dataBase = arguments[2];
    String outPut = arguments[3];

    //set Hadoop configuration
    Job job = Job.getInstance(getConf(), "DIAMOND");
    Configuration conf = job.getConfiguration();
    SetConf.setHadoopConf(conf);

    //get user name
    userName = HadoopUser.getHadoopUser();

    //delete all existing DIAMOND files under current Hadoop user
    DeleteHDFSFiles.deleteAllFiles(userName);

    //make Hamond directory on HDFS
    MakeHamondHDFSdir.makedir(conf, userName);

    //build the DIAMOND database locally, then copy it to HDFS along with the query and delete the local copy
    MakeDB.makeDB(diamond, dataBase);

    //copy DIAMOND bin, query and local database file to HDFS
    CopyFromLocal.copyFromLocal(conf, diamond, query, dataBase, userName);

    //pass query name and database name to mappers
    conf.set(QUERY, query);
    conf.set(DATABASE, dataBase + ".dmnd");
    String[] subArgs = Arrays.copyOfRange(arguments, 4, arguments.length);
    conf.setStrings("DIAMOND-arguments", subArgs);
    conf.setStrings(OUTPUT, outPut);

    //add DIAMOND bin and database into distributed cache
    job.addCacheFile(new URI("/user/" + userName + "/Hamond/diamond"));
    job.addCacheFile(new URI("/user/" + userName + "/Hamond/" + new Path(dataBase).getName() + ".dmnd"));

    //set job input and output paths
    FileInputFormat.addInputPath(job, new Path("/user/" + userName + "/Hamond/" + new Path(query).getName()));
    FileOutputFormat.setOutputPath(job, new Path("/user/" + userName + "/Hamond/out"));

    //set job driver and mapper
    job.setJarByClass(DiamondMapReduce.class);
    job.setMapperClass(DiamondMapper.class);

    //set job input format to the customized multi-line input format
    job.setInputFormatClass(CustomNLineFileInputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setNumReduceTasks(0);

    return job.waitForCompletion(true) ? 0 : 1;

}