List of usage examples for org.apache.hadoop.mapred JobConf setInputFormat
public void setInputFormat(Class<? extends InputFormat> theClass)
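setInputFormat selects the InputFormat implementation that splits the job's input and parses it into <key, value> records for the mapper. It belongs to the legacy org.apache.hadoop.mapred API, which is why the examples below carry @SuppressWarnings("deprecation"). As a minimal, self-contained sketch of where the call fits (the class name SetInputFormatSketch is made up for illustration, and the identity mapper/reducer stand in for real job logic):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

// Hypothetical class name, for illustration only.
public class SetInputFormatSketch {
    @SuppressWarnings("deprecation")
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(SetInputFormatSketch.class);
        conf.setJobName("setInputFormat-sketch");

        // setInputFormat tells the job how to turn input files into records;
        // TextInputFormat yields <LongWritable byte offset, Text line>.
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        // Output types match what the identity mapper/reducer pass through.
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);

        // Identity mapper/reducer: forward records unchanged.
        conf.setMapperClass(IdentityMapper.class);
        conf.setReducerClass(IdentityReducer.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}

In the newer org.apache.hadoop.mapreduce API the equivalent call is Job.setInputFormatClass.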
From source file:jobimtext.thesaurus.distributional.hadoop.mapreduce.AggrPerFtUniquePositions.java
License:Apache License
/** Set the job configuration and classes, then run the job. */
@SuppressWarnings("deprecation")
public static void main(String[] args) throws Exception {
    JobConf conf = HadoopUtil.generateJobConf(args);
    // JobConf conf = new JobConf(AggrPerFtUniquePositions.class);
    conf.setJobName("AggrPerFtUniquePositions " + args[0] + " " + args[1]);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    /* use compression */
    conf.set("mapred.output.compress", "true");
    conf.set("mapred.map.output.compress", "true");
    conf.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.SnappyCodec");
    conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.SnappyCodec");

    /* set the maximum number of tasks per node */
    int maptasks = 120;
    conf.set("mapred.tasktracker.map.tasks.maximum", "" + maptasks);
    conf.set("mapred.map.tasks", "" + maptasks);
    conf.set("mapred.tasktracker.map", "" + maptasks);

    int reducetasks = 60;
    conf.set("mapred.tasktracker.reduce.tasks.maximum", "" + reducetasks);
    conf.set("mapred.reduce.tasks", "" + reducetasks);
    conf.set("mapred.tasktracker.reduce", "" + reducetasks);

    /* heap size for the job */
    conf.set("mapred.child.java.opts", "-Xmx1500m");

    /* how much virtual memory the entire process tree of each map/reduce task will use */
    conf.set("mapred.job.map.memory.mb", "2048");
    conf.set("mapred.job.reduce.memory.mb", "2048");

    JobClient.runJob(conf);
}
From source file:jobimtext.thesaurus.distributional.hadoop.mapreduce.AggrPerFtWithParams.java
License:Apache License
/** Set the job configuration and classes, then run the job. */
@SuppressWarnings("deprecation")
public static void main(String[] args) throws Exception {
    JobConf conf = HadoopUtil.generateJobConf(args);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    /* set the maximum number of tasks per node */
    int maptasks = 120;
    conf.set("mapred.tasktracker.map.tasks.maximum", "" + maptasks);
    conf.set("mapred.map.tasks", "" + maptasks);
    conf.set("mapred.tasktracker.map", "" + maptasks);

    int reducetasks = 120;
    conf.set("mapred.tasktracker.reduce.tasks.maximum", "" + reducetasks);
    conf.set("mapred.reduce.tasks", "" + reducetasks);
    conf.set("mapred.tasktracker.reduce", "" + reducetasks);

    /* heap size for the job */
    conf.set("mapred.child.java.opts", "-Xmx1500m");

    /* how much virtual memory the entire process tree of each map/reduce task will use */
    conf.set("mapred.job.map.memory.mb", "2048");
    conf.set("mapred.job.reduce.memory.mb", "2048");

    JobClient.runJob(conf);
}
From source file:jobimtext.thesaurus.distributional.hadoop.mapreduce.AggrPerWord.java
License:Apache License
/** Set the job configuration and classes, then run the job. */
@SuppressWarnings("deprecation")
public static void main(String[] args) throws Exception {
    JobConf conf = HadoopUtil.generateJobConf(args);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    /* use compression */
    conf.set("mapred.output.compress", "true");
    conf.set("mapred.map.output.compress", "true");
    conf.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.SnappyCodec");
    conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.SnappyCodec");

    /* set the maximum number of tasks per node */
    int maptasks = 120;
    conf.set("mapred.tasktracker.map.tasks.maximum", "" + maptasks);
    conf.set("mapred.map.tasks", "" + maptasks);
    conf.set("mapred.tasktracker.map", "" + maptasks);

    int reducetasks = 120;
    conf.set("mapred.tasktracker.reduce.tasks.maximum", "" + reducetasks);
    conf.set("mapred.reduce.tasks", "" + reducetasks);
    conf.set("mapred.tasktracker.reduce", "" + reducetasks);

    /* heap size for the job */
    conf.set("mapred.child.java.opts", "-Xmx1500m");

    /* how much virtual memory the entire process tree of each map/reduce task will use */
    conf.set("mapred.job.map.memory.mb", "2048");
    conf.set("mapred.job.reduce.memory.mb", "2048");

    JobClient.runJob(conf);
}
From source file:jobimtext.thesaurus.distributional.hadoop.mapreduce.CleanContext.java
License:Apache License
@SuppressWarnings("deprecation") public static void main(String[] args) throws Exception { JobConf conf = HadoopUtil.generateJobConf(args); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(IntWritable.class); conf.setMapperClass(Map.class); conf.setCombinerClass(IntSumReducer.class); conf.setReducerClass(IntSumReducer.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); FileInputFormat.setInputPaths(conf, new Path(args[0])); FileOutputFormat.setOutputPath(conf, new Path(args[1])); /* number of milliseconds before killing a not responding task */ conf.set("mapred.task.timeout", "600000"); /* change to 128mb */ conf.set("dfs.block.size", "134217728"); /* set the maximum number of task per node */ int maptasks = 100; /*// w ww. j a va 2 s. com * Number of map tasks to deploy on each machine. 0.5 to 2 * * (cores/node) */ conf.set("mapred.tasktracker.map.tasks.maximum", "" + maptasks); conf.set("mapred.tasktracker.map", "" + maptasks); /* * The default number of map tasks per job. Typically set to a prime * several times greater than number of available hosts. */ conf.set("mapred.map.tasks", "" + maptasks); int reducetasks = 120; conf.set("mapred.tasktracker.reduce.tasks.maximum", "" + reducetasks); conf.set("mapred.tasktracker.reduce", "" + reducetasks); conf.set("mapred.reduce.tasks", "" + reducetasks); conf.set("mapred.job.map.memory.mb", "3000"); conf.set("mapred.job.reduce.memory.mb", "3000"); JobClient.runJob(conf); }
From source file:jobimtext.thesaurus.distributional.hadoop.mapreduce.FeatureCount.java
License:Apache License
@SuppressWarnings("deprecation") public static void main(String[] args) throws Exception { JobConf conf = HadoopUtil.generateJobConf(args); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(IntWritable.class); conf.setMapperClass(Map.class); conf.setCombinerClass(IntSumReducer.class); conf.setReducerClass(IntSumReducer.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); FileInputFormat.setInputPaths(conf, new Path(args[0])); FileOutputFormat.setOutputPath(conf, new Path(args[1])); int maptasks = 120; /* set the maximum number of task per node */ conf.set("mapred.tasktracker.map.tasks.maximum", "" + maptasks); conf.set("mapred.map.tasks", "" + maptasks); conf.set("mapred.tasktracker.map", "" + maptasks); int reducetasks = 100; conf.set("mapred.tasktracker.reduce.tasks.maximum", "" + reducetasks); conf.set("mapred.reduce.tasks", "" + reducetasks); conf.set("mapred.tasktracker.reduce", "" + reducetasks); /*//from w ww . j a va2 s . com * how much virtual memory the entire process tree of each map/reduce * task will use */ conf.set("mapred.job.map.memory.mb", "2048"); conf.set("mapred.job.reduce.memory.mb", "2048"); conf.set("dfs.replication", "1"); JobClient.runJob(conf); }
From source file:jobimtext.thesaurus.distributional.hadoop.mapreduce.SimCounts.java
License:Apache License
@SuppressWarnings("deprecation") public static void main(String[] args) throws Exception { JobConf conf = HadoopUtil.generateJobConf(args); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(DoubleWritable.class); conf.setMapperClass(Map.class); conf.setCombinerClass(DoubleSumReducer.class); conf.setReducerClass(DoubleSumReducer.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); FileInputFormat.setInputPaths(conf, new Path(args[0])); FileOutputFormat.setOutputPath(conf, new Path(args[1])); /* number of milliseconds before killing a not responding task */ conf.set("mapred.task.timeout", "600000"); /* change to 128mb */ conf.set("dfs.block.size", "134217728"); /* set the maximum number of task per node */ int maptasks = 100; /* Number of map tasks to deploy on each machine. 0.5 to 2 * (cores/node) */ conf.set("mapred.tasktracker.map.tasks.maximum", "" + maptasks); conf.set("mapred.tasktracker.map", "" + maptasks); /* The default number of map tasks per job. Typically set to a prime several times greater than number of available hosts. */ conf.set("mapred.map.tasks", "" + maptasks); int reducetasks = 100; conf.set("mapred.tasktracker.reduce.tasks.maximum", "" + reducetasks); conf.set("mapred.tasktracker.reduce", "" + reducetasks); conf.set("mapred.reduce.tasks", "" + reducetasks); JobClient.runJob(conf);// w w w .j av a 2 s. c om }
From source file:jobimtext.thesaurus.distributional.hadoop.mapreduce.SimCounts1.java
License:Apache License
@SuppressWarnings("deprecation") public static void main(String[] args) throws Exception { JobConf conf = HadoopUtil.generateJobConf(args); conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec"); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(DoubleWritable.class); conf.setMapperClass(Map.class); conf.setCombinerClass(DoubleSumReducer.class); conf.setReducerClass(DoubleSumReducer.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); FileInputFormat.setInputPaths(conf, new Path(args[0])); FileOutputFormat.setOutputPath(conf, new Path(args[1])); /* number of milliseconds before killing a not responding task */ //conf.set("mapred.task.timeout", "600000"); //conf.set("mapred.map.tasks.speculative.execution", "false"); /* change to 128mb */ //conf.set("dfs.block.size", "134217728"); /*//from w w w . j a v a 2 s.c o m * use compression */ /* conf.set("mapred.output.compress", "true"); conf.set("mapred.map.output.compress", "true"); conf.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.SnappyCodec"); conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.SnappyCodec"); */ /* set the maximum number of task per node */ int maptasks = 100; /* Number of map tasks to deploy on each machine. 0.5 to 2 * (cores/node) */ conf.set("mapred.tasktracker.map.tasks.maximum", "" + maptasks); conf.set("mapred.tasktracker.map", "" + maptasks); /* The default number of map tasks per job. Typically set to a prime several times greater than number of available hosts. */ conf.set("mapred.map.tasks", "" + maptasks); int reducetasks = 80; conf.set("mapred.tasktracker.reduce.tasks.maximum", "" + reducetasks); conf.set("mapred.tasktracker.reduce", "" + reducetasks); conf.set("mapred.reduce.tasks", "" + reducetasks); /* * how much virtual memory the entire process tree of each map/reduce * task will use */ conf.set("mapred.job.map.memory.mb", "4000"); conf.set("mapred.job.reduce.memory.mb", "4000"); conf.set("dfs.replication", "1"); /* * reduce I/O load */ conf.set("mapred.child.java.opts", "-Xmx1400M"); conf.set("io.sort.mb", "300"); conf.set("io.sort.factor", "30"); JobClient.runJob(conf); }
From source file:jobimtext.thesaurus.distributional.hadoop.mapreduce.SimCounts1WithFeatures.java
License:Apache License
@SuppressWarnings("deprecation") public static void main(String[] args) throws Exception { JobConf conf = HadoopUtil.generateJobConf(args); /* set the new defined type to be used */ conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(Text.class); conf.setMapperClass(Map.class); conf.setCombinerClass(Reduce.class); conf.setReducerClass(Reduce.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); FileInputFormat.setInputPaths(conf, new Path(args[0])); FileOutputFormat.setOutputPath(conf, new Path(args[1])); if (args.length > 3) { conf.setInt("threshold", Integer.parseInt(args[3])); }/* ww w .j a v a 2 s .c om*/ /* number of milliseconds before killing a not responding task */ conf.set("mapred.task.timeout", "600000"); /* change to 128mb */ conf.set("dfs.block.size", "134217728"); /* set the maximum number of task per node */ int maptasks = 200; /* * Number of map tasks to deploy on each machine. 0.5 to 2 * * (cores/node) */ conf.set("mapred.tasktracker.map.tasks.maximum", "" + maptasks); conf.set("mapred.tasktracker.map", "" + maptasks); /* * The default number of map tasks per job. Typically set to a prime * several times greater than number of available hosts. */ conf.set("mapred.map.tasks", "" + maptasks); int reducetasks = 20; conf.set("mapred.tasktracker.reduce.tasks.maximum", "" + reducetasks); conf.set("mapred.tasktracker.reduce", "" + reducetasks); conf.set("mapred.reduce.tasks", "" + reducetasks); /* * how much virtual memory the entire process tree of each map/reduce * task will use */ conf.set("mapred.job.map.memory.mb", "4000"); conf.set("mapred.job.reduce.memory.mb", "4000"); conf.set("dfs.replication", "1"); /* * reduce I/O load */ conf.set("mapred.child.java.opts", "-Xmx1400M"); conf.set("io.sort.mb", "300"); conf.set("io.sort.factor", "30"); JobClient.runJob(conf); }
From source file:jobimtext.thesaurus.distributional.hadoop.mapreduce.SimCountsLog.java
License:Apache License
/**
 * The reducer step will sum all float values, i.e. the
 * weight for any (word1, word2) pair sharing a feature.
 */
public static void main(String[] args) throws Exception {
    JobConf conf = HadoopUtil.generateJobConf(args);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(FloatWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(DoubleSumReducer.class);
    conf.setReducerClass(DoubleSumReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    /* number of milliseconds before killing a task that is not responding */
    conf.set("mapred.task.timeout", "600000");

    /* change the block size to 128 MB */
    conf.set("dfs.block.size", "134217728");

    /* set the maximum number of tasks per node */
    int maptasks = 100;

    /* Number of map tasks to deploy on each machine: 0.5 to 2 * (cores/node) */
    conf.set("mapred.tasktracker.map.tasks.maximum", "" + maptasks);
    conf.set("mapred.tasktracker.map", "" + maptasks);

    /*
     * The default number of map tasks per job. Typically set to a prime
     * several times greater than the number of available hosts.
     */
    conf.set("mapred.map.tasks", "" + maptasks);

    int reducetasks = 100;
    conf.set("mapred.tasktracker.reduce.tasks.maximum", "" + reducetasks);
    conf.set("mapred.tasktracker.reduce", "" + reducetasks);
    conf.set("mapred.reduce.tasks", "" + reducetasks);

    JobClient.runJob(conf);
}