Example usage for org.apache.hadoop.mapred JobConf setInputFormat

List of usage examples for org.apache.hadoop.mapred JobConf setInputFormat

Introduction

In this page you can find the example usage for org.apache.hadoop.mapred JobConf setInputFormat.

Prototype

public void setInputFormat(Class<? extends InputFormat> theClass) 

Source Link

Document

Set the InputFormat implementation for the map-reduce job.

Usage

From source file:de.tudarmstadt.lt.n2n.hadoop.FilterByVocabularyJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), FilterByVocabularyJob.class);
    conf.setJobName(FilterByVocabularyJob.class.getSimpleName());

    conf.setMapperClass(FilterByVocabularyMapper.class);
    conf.setReducerClass(IdentityReducer.class);

    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    String word_list_file = conf.get(SHARED_CONSTANTS.PARAM_WORD_LIST);
    if (word_list_file == null)
        throw new MissingArgumentException(
                "Please specify word list with '-Dnlkg.filterbywordsfile=<path-to-file-in-hdfs>'.");

    DistributedCache.addFileToClassPath(new Path(word_list_file), conf);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);/*from www  .  j a  va2s.co  m*/
    return 0;
}

From source file:de.tudarmstadt.lt.n2n.hadoop.FlipJoBims.java

License:Apache License

public static void main(String[] args) throws Exception {

    JobConf conf = new JobConf(FlipJoBims.class);

    /* begin necessary for UKP cluster */
    conf.setMemoryForMapTask(1000L); // 1 GB /* necessary for UKP cdh3 */
    conf.setMemoryForReduceTask(1000L); // 1 GB /* necessary for UKP cdh3 */
    FileOutputFormat.setCompressOutput(conf, true); // compress output
    FileOutputFormat.setOutputCompressorClass(conf,
            org.apache.hadoop.io.compress.BZip2Codec.class); /* use the bzip2 codec for compression */
    conf.setCompressMapOutput(true); // compress mapper output
    /* end necessary for UKP cluster */

    conf.setJobName(FlipJoBims.class.getSimpleName());
    args = new GenericOptionsParser(conf, args).getRemainingArgs();

    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapperClass(FlipJoBims.Map.class);
    conf.setNumReduceTasks(0);//from  ww w .  j  av  a2s .co  m
    // conf.setReducerClass(IdentityReducer.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setOutputKeyClass(Text.class);

    conf.setMapOutputValueClass(Text.class);
    conf.setOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);

}

From source file:de.tudarmstadt.lt.n2n.hadoop.GoogleSyntacticsJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), GoogleSyntacticsJob.class);
    conf.setJobName(GoogleSyntacticsJob.class.getSimpleName());

    conf.setMapperClass(GoogleSyntacticsJob3Mapper.class);
    conf.setReducerClass(GoogleSyntacticsJob3Reducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setOutputKeyClass(JoBimFormat.class);
    conf.setOutputValueClass(IntWritable.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    // delete output path for testing purposes
    // FileSystem.get(conf).delete(new Path(args[1]), true);

    String extractorConfigurationFiles = conf.get(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS);
    if (extractorConfigurationFiles == null) {
        extractorConfigurationFiles = StringUtils.join(SHARED_CONSTANTS.DEFAULT_EXTRACTOR_CONFIGURATIONS, ',');
        System.out.format("Extractorconfigurationfile parameter not set. Assuming -D%s=%s %n",
                SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, extractorConfigurationFiles);
        conf.set(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, extractorConfigurationFiles);
    }/*from   w  w  w. j a  va 2s  .  c o  m*/

    String[] extractorConfigurationFilesArr = extractorConfigurationFiles.split(",");
    for (int i = 0; i < extractorConfigurationFilesArr.length; i++)
        DistributedCache.addFileToClassPath(new Path(extractorConfigurationFilesArr[i]), conf);

    JobClient.runJob(conf);
    return 0;
}

From source file:de.tudarmstadt.lt.n2n.hadoop.pipetests.GoogleSyntacticsJob2.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), GoogleSyntacticsJob2.class);
    conf.setJobName(GoogleSyntacticsJob2.class.getSimpleName());

    conf.setMapperClass(GoogleSyntacticsJob2Mapper.class);
    conf.setReducerClass(GoogleSyntacticsJob2Reducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    // conf.setMapOutputKeyClass(Text.class);
    // conf.setMapOutputValueClass(NullWritable.class);

    conf.setOutputKeyClass(JoBimFormat.class);
    conf.setOutputValueClass(IntWritable.class);

    args = new GenericOptionsParser(conf, args).getRemainingArgs();
    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    // delete output path for testing purposes
    // FileSystem.get(conf).delete(new Path(args[1]), true);

    String extractorConfigurationFiles = conf.get(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS);
    if (extractorConfigurationFiles == null) {
        extractorConfigurationFiles = StringUtils.join(SHARED_CONSTANTS.DEFAULT_EXTRACTOR_CONFIGURATIONS, ',');
        System.out.format("Extractorconfigurationfile parameter not set. Assuming -D%s=%s %n",
                SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, extractorConfigurationFiles);
        conf.set(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, extractorConfigurationFiles);
    }/*w  ww .  j a  va 2  s.  c  o m*/

    String[] extractorConfigurationFilesArr = extractorConfigurationFiles.split(",");
    for (int i = 0; i < extractorConfigurationFilesArr.length; i++)
        DistributedCache.addFileToClassPath(new Path(extractorConfigurationFilesArr[i]), conf);

    JobClient.runJob(conf);
    return 0;
}

From source file:de.tudarmstadt.lt.n2n.hadoop.pipetests.GoogleSyntacticsJob4.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), GoogleSyntacticsJob4.class);
    conf.setJobName(GoogleSyntacticsJob4.class.getSimpleName());

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    // delete output path for testing purposes
    // FileSystem.get(conf).delete(new Path(args[1]), true);

    String extractorConfigurationFiles = conf.get(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS);
    if (extractorConfigurationFiles == null) {
        extractorConfigurationFiles = StringUtils.join(SHARED_CONSTANTS.DEFAULT_EXTRACTOR_CONFIGURATIONS, ',');
        System.out.format("Extractorconfigurationfile parameter not set. Assuming -D%s=%s %n",
                SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, extractorConfigurationFiles);
        conf.set(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, extractorConfigurationFiles);
    }//w  w w.  j av a2 s .co m

    String[] extractorConfigurationFilesArr = extractorConfigurationFiles.split(",");
    for (int i = 0; i < extractorConfigurationFilesArr.length; i++)
        DistributedCache.addFileToClassPath(new Path(extractorConfigurationFilesArr[i]), conf);

    conf.setMapperClass(GoogleSyntacticsJob4Mapper.class);
    conf.setInputFormat(TextInputFormat.class);
    conf.setMapOutputKeyClass(NullWritable.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setNumReduceTasks(0);
    conf.setCombinerClass(IdentityReducer.class);

    JobClient.runJob(conf);
    return 0;
}

From source file:de.tudarmstadt.lt.n2n.hadoop.RelationToOneHoleTransformerJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), RelationToOneHoleTransformerJob.class);

    conf.setJobName(RelationToOneHoleTransformerJob.class.getSimpleName());
    args = new GenericOptionsParser(conf, args).getRemainingArgs();

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapperClass(RelationToOneHoleTransformerJob.Map.class);
    conf.setNumReduceTasks(0);//w  w  w . j a  v  a  2 s  .  c  o m
    // conf.setReducerClass(IdentityReducer.class); // sort or no sort? - that is here the question

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));
    JobClient.runJob(conf);

    return 0;
}

From source file:de.tudarmstadt.lt.n2n.hadoop.RemoveExactDuplicatesJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), RemoveExactDuplicatesJob.class);
    conf.setJobName(RemoveExactDuplicatesJob.class.getSimpleName());

    conf.setMapperClass(LineMapper.class);
    conf.setReducerClass(KeyReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(NullWritable.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    // delete output path for testing purposes
    // FileSystem.get(conf).delete(new Path(args[1]), true);

    JobClient.runJob(conf);/*from  www .  j  a va 2 s.com*/
    return 0;
}

From source file:de.tudarmstadt.lt.nlkg.ConvertInvertSVO.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), ConvertInvertSVO.class);
    conf.setJobName(ConvertInvertSVO.class.getSimpleName());

    conf.setMapperClass(ConversionMapper.class);
    conf.setCombinerClass(IdentityReducer.class);
    conf.setReducerClass(IdentityReducer.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(ConvertedWritable.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);//from ww  w .  j a  va 2  s.  com
    return 0;
}

From source file:de.tudarmstadt.lt.nlkg.ConvertSVO.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), ConvertSVO.class);
    conf.setJobName(ConvertSVO.class.getSimpleName());

    conf.setMapperClass(ConversionMapper.class);
    conf.setCombinerClass(IdentityReducer.class);
    conf.setReducerClass(IdentityReducer.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(ConvertedWritable.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);/* w w  w .j  a  v a 2 s.c o m*/
    return 0;
}

From source file:de.tudarmstadt.ukp.dkpro.bigdata.examples.CasConsumerExample.java

License:Apache License

@Override
public void configure(JobConf job) {
    job.set("mapreduce.job.queuename", "smalljob");
    job.setInputFormat(Text2CASInputFormat.class);
}