List of usage examples for org.apache.hadoop.mapred JobConf setInputFormat
public void setInputFormat(Class<? extends InputFormat> theClass)
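setInputFormat registers the InputFormat implementation the job will use to produce input splits and records. Before the per-project examples below, here is a minimal, self-contained sketch of a driver that calls it; the class name, job name, and paths are hypothetical placeholders, not taken from any of the sources listed.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;

public class SetInputFormatSketch {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf(SetInputFormatSketch.class); // hypothetical driver class
        job.setJobName("set-input-format-sketch");

        // Tell the job which InputFormat to use when reading splits/records.
        job.setInputFormat(TextInputFormat.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));   // input path from the command line
        FileOutputFormat.setOutputPath(job, new Path(args[1]));  // output path from the command line

        // Default IdentityMapper/IdentityReducer are used since none are set explicitly.
        JobClient.runJob(job);
    }
}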
From source file:cascading.jdbc.db.DBInputFormat.java
License:Apache License
/**
 * Initializes the map-part of the job with the appropriate input settings.
 *
 * @param job             The job
 * @param inputClass      the class object implementing DBWritable, which is the
 *                        Java object holding tuple fields.
 * @param tableName       The table to read data from
 * @param conditions      The condition which to select data with, e.g.
 *                        '(updated > 20070101 AND length > 0)'
 * @param orderBy         the fieldNames in the orderBy clause.
 * @param limit
 * @param fieldNames      The field names in the table
 * @param concurrentReads
 */
public static void setInput(JobConf job, Class<? extends DBWritable> inputClass, String tableName,
        String conditions, String orderBy, long limit, int concurrentReads, String... fieldNames) {
    job.setInputFormat(DBInputFormat.class);

    DBConfiguration dbConf = new DBConfiguration(job);

    dbConf.setInputClass(inputClass);
    dbConf.setInputTableName(tableName);
    dbConf.setInputFieldNames(fieldNames);
    dbConf.setInputConditions(conditions);
    dbConf.setInputOrderBy(orderBy);

    if (limit != -1)
        dbConf.setInputLimit(limit);

    dbConf.setMaxConcurrentReadsNum(concurrentReads);
}
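As a usage sketch, a driver might invoke this table-based overload as shown below. The driver class, DBWritable implementation, table, and field names are hypothetical, and the JDBC connection settings (driver class, URL, credentials) are assumed to be configured on the JobConf elsewhere.

JobConf job = new JobConf(MyDriver.class);        // hypothetical driver class
// JDBC driver/URL/credentials are assumed to already be set on the JobConf.
DBInputFormat.setInput(job, EmployeeRecord.class, // hypothetical DBWritable implementation
        "employees",                              // tableName
        "(salary > 0)",                           // conditions
        "id",                                     // orderBy
        -1,                                       // limit (-1 = no limit, per the check above)
        4,                                        // concurrentReads
        "id", "name", "salary");                  // fieldNames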
From source file:cascading.jdbc.db.DBInputFormat.java
License:Apache License
/**
 * Initializes the map-part of the job with the appropriate input settings.
 *
 * @param job             The job
 * @param inputClass      the class object implementing DBWritable, which is the
 *                        Java object holding tuple fields.
 * @param selectQuery     the input query to select fields. Example:
 *                        "SELECT f1, f2, f3 FROM Mytable ORDER BY f1"
 * @param countQuery      the input query that returns the number of records in the table.
 *                        Example: "SELECT COUNT(f1) FROM Mytable"
 * @param concurrentReads
 * @see #setInput(org.apache.hadoop.mapred.JobConf, Class, String, String, String, String...)
 */
public static void setInput(JobConf job, Class<? extends DBWritable> inputClass, String selectQuery,
        String countQuery, long limit, int concurrentReads) {
    job.setInputFormat(DBInputFormat.class);

    DBConfiguration dbConf = new DBConfiguration(job);

    dbConf.setInputClass(inputClass);
    dbConf.setInputQuery(selectQuery);
    dbConf.setInputCountQuery(countQuery);

    if (limit != -1)
        dbConf.setInputLimit(limit);

    dbConf.setMaxConcurrentReadsNum(concurrentReads);
}
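A corresponding sketch for the query-based overload, again with a hypothetical driver class, DBWritable implementation, and queries:

JobConf job = new JobConf(MyDriver.class);        // hypothetical driver class
DBInputFormat.setInput(job, EmployeeRecord.class, // hypothetical DBWritable implementation
        "SELECT id, name, salary FROM employees ORDER BY id", // selectQuery
        "SELECT COUNT(id) FROM employees",                     // countQuery
        -1,                                                    // limit (-1 = no limit)
        4);                                                    // concurrentReads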
From source file:cascading.scheme.SequenceFile.java
License:Open Source License
@Override
public void sourceInit(Tap tap, JobConf conf) {
    conf.setInputFormat(SequenceFileInputFormat.class);
}
From source file:cascading.scheme.TextLine.java
License:Open Source License
@Override
public void sourceInit(Tap tap, JobConf conf) {
    if (hasZippedFiles(FileInputFormat.getInputPaths(conf)))
        conf.setInputFormat(ZipInputFormat.class);
    else
        conf.setInputFormat(TextInputFormat.class);
}
From source file:cascading.tap.hadoop.io.MultiInputFormat.java
License:Open Source License
/**
 * Used to set the current JobConf with all sub jobs configurations.
 *
 * @param toJob
 * @param fromJobs
 */
public static void addInputFormat(JobConf toJob, JobConf... fromJobs) {
    toJob.setInputFormat(MultiInputFormat.class);
    List<Map<String, String>> configs = new ArrayList<Map<String, String>>();
    List<Path> allPaths = new ArrayList<Path>();

    boolean isLocal = false;

    for (JobConf fromJob : fromJobs) {
        if (fromJob.get("mapred.input.format.class") == null)
            throw new CascadingException(
                    "mapred.input.format.class is required, should be set in source Scheme#sourceConfInit");

        configs.add(HadoopUtil.getConfig(toJob, fromJob));
        Collections.addAll(allPaths, FileInputFormat.getInputPaths(fromJob));

        if (!isLocal)
            isLocal = HadoopUtil.isLocal(fromJob);
    }

    if (!allPaths.isEmpty()) // it's possible there aren't any
        FileInputFormat.setInputPaths(toJob, (Path[]) allPaths.toArray(new Path[allPaths.size()]));

    try {
        toJob.set("cascading.multiinputformats", HadoopUtil.serializeBase64(configs, toJob, true));
    } catch (IOException exception) {
        throw new CascadingException("unable to pack input formats", exception);
    }

    if (isLocal)
        HadoopUtil.setLocal(toJob);
}
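A possible caller-side sketch for combining two per-source configurations. In a real Cascading flow the Scheme's sourceConfInit would populate each sub-configuration; here setInputFormat is called directly for illustration, and the paths are hypothetical. Note that each sub-configuration must already carry mapred.input.format.class or the check above throws.

JobConf merged = new JobConf(MyFlowDriver.class);  // hypothetical target job

JobConf textSource = new JobConf(merged);
textSource.setInputFormat(TextInputFormat.class);  // satisfies the mapred.input.format.class check
FileInputFormat.setInputPaths(textSource, new Path("/data/text")); // hypothetical path

JobConf seqSource = new JobConf(merged);
seqSource.setInputFormat(SequenceFileInputFormat.class);
FileInputFormat.setInputPaths(seqSource, new Path("/data/seq"));   // hypothetical path

MultiInputFormat.addInputFormat(merged, textSource, seqSource);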
From source file:cascading.tap.hadoop.MultiInputFormat.java
License:Open Source License
/**
 * Used to set the current JobConf with all sub jobs configurations.
 *
 * @param toJob
 * @param fromJobs
 */
public static void addInputFormat(JobConf toJob, JobConf... fromJobs) {
    toJob.setInputFormat(MultiInputFormat.class);
    List<Map<String, String>> configs = new ArrayList<Map<String, String>>();
    List<Path> allPaths = new ArrayList<Path>();

    boolean isLocal = false;

    for (JobConf fromJob : fromJobs) {
        configs.add(getConfig(toJob, fromJob));
        Collections.addAll(allPaths, FileInputFormat.getInputPaths(fromJob));

        if (!isLocal)
            isLocal = fromJob.get("mapred.job.tracker").equalsIgnoreCase("local");
    }

    FileInputFormat.setInputPaths(toJob, (Path[]) allPaths.toArray(new Path[allPaths.size()]));

    try {
        toJob.set("cascading.multiinputformats", Util.serializeBase64(configs));
    } catch (IOException exception) {
        throw new CascadingException("unable to pack input formats", exception);
    }

    if (isLocal)
        toJob.set("mapred.job.tracker", "local");
}
From source file:clusteringblocks.ClusteringBlocks.java
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    JobConf job = new JobConf(conf, ClusteringBlocks.class);

    Path in = new Path(args[0]);
    Path out = new Path(args[1]);
    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    job.setJobName("ClusteringBlocks");
    job.setMapperClass(MapClass.class);
    job.setReducerClass(Reduce.class);

    job.setInputFormat(KeyValueTextInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    // job.set("key.value.separator.in.input.line", "");

    JobClient.runJob(job);

    return 0;
}
From source file:cn.edu.hfut.dmic.webcollectorcluster.fetcher.Fetcher.java
@Override
public int run(String[] args) throws Exception {
    JobConf jc = new JobConf(getConf());
    jc.setJarByClass(Fetcher.class);
    jc.setInputFormat(SequenceFileInputFormat.class);

    Path input = new Path(args[0], "current");
    Path output = new Path(args[1]);

    Configuration conf = CrawlerConfiguration.create();
    FileSystem fs = output.getFileSystem(conf);
    if (fs.exists(output)) {
        fs.delete(output);
    }

    FileInputFormat.addInputPath(jc, input);
    FileOutputFormat.setOutputPath(jc, output);

    jc.setMapOutputKeyClass(Text.class);
    jc.setMapOutputValueClass(WebWritable.class);
    jc.setMapRunnerClass(Fetcher.class);
    jc.setOutputFormat(FetcherOutputFormat.class);

    JobClient.runJob(jc);
    return 0;
}
From source file:cn.edu.xmu.dm.mapreduce.MultiFileWordCount.java
License:Apache License
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        printUsage();
        return 1;
    }

    JobConf job = new JobConf(getConf(), MultiFileWordCount.class);
    job.setJobName("MultiFileWordCount");

    // set the InputFormat of the job to our InputFormat
    job.setInputFormat(MyInputFormat.class);

    // the keys are words (strings)
    job.setOutputKeyClass(Text.class);
    // the values are counts (ints)
    job.setOutputValueClass(LongWritable.class);

    // use the defined mapper
    job.setMapperClass(MapClass.class);
    // use the WordCount Reducer
    job.setCombinerClass(LongSumReducer.class);
    job.setReducerClass(LongSumReducer.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    JobClient.runJob(job);

    return 0;
}
From source file:cn.edu.xmu.dm.mapreduce.Sort.java
License:Apache License
/**
 * The main driver for the sort program. Invoke this method to submit the
 * map/reduce job.
 *
 * @throws IOException When there are communication problems with the job tracker.
 */
public int run(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = new Job(conf, "Sorter");
    job.setJarByClass(Sort.class);

    JobConf jobConf = new JobConf(getConf(), Sort.class);
    jobConf.setJobName("sorter");

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(IdentityReducer.class);

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9);
    String sort_reduces = jobConf.get("test.sort.reduces_per_host");
    if (sort_reduces != null) {
        num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces);
    }

    Class<? extends InputFormat> inputFormatClass = SequenceFileInputFormat.class;
    Class<? extends OutputFormat> outputFormatClass = SequenceFileOutputFormat.class;
    Class<? extends WritableComparable> outputKeyClass = BytesWritable.class;
    Class<? extends Writable> outputValueClass = BytesWritable.class;
    List<String> otherArgs = new ArrayList<String>();
    InputSampler.Sampler<K, V> sampler = null;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                jobConf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("-r".equals(args[i])) {
                num_reduces = Integer.parseInt(args[++i]);
            } else if ("-inFormat".equals(args[i])) {
                inputFormatClass = Class.forName(args[++i]).asSubclass(InputFormat.class);
            } else if ("-outFormat".equals(args[i])) {
                outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class);
            } else if ("-outKey".equals(args[i])) {
                outputKeyClass = Class.forName(args[++i]).asSubclass(WritableComparable.class);
            } else if ("-outValue".equals(args[i])) {
                outputValueClass = Class.forName(args[++i]).asSubclass(Writable.class);
            } else if ("-totalOrder".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new InputSampler.RandomSampler<K, V>(pcnt, numSamples, maxSplits);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Set user-supplied (possibly default) job configs
    jobConf.setNumReduceTasks(num_reduces);

    jobConf.setInputFormat(inputFormatClass);
    jobConf.setOutputFormat(outputFormatClass);

    jobConf.setOutputKeyClass(outputKeyClass);
    jobConf.setOutputValueClass(outputValueClass);

    // Make sure there are exactly 2 parameters left.
    if (otherArgs.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(jobConf, otherArgs.get(0));
    FileOutputFormat.setOutputPath(jobConf, new Path(otherArgs.get(1)));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");
        jobConf.setPartitionerClass(TotalOrderPartitioner.class);
        Path inputDir = FileInputFormat.getInputPaths(jobConf)[0];
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(jobConf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(jobConf, partitionFile);
        InputSampler.<K, V>writePartitionFile(jobConf, sampler);
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, jobConf);
        DistributedCache.createSymlink(jobConf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
            + " with " + num_reduces + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return 0;
}
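Since this driver follows the Tool pattern (getConf() plus run(String[])), a sketch of launching it programmatically might look like the following. It assumes Sort implements Tool and has a no-arg constructor, and the input/output paths are hypothetical; the flags mirror the ones parsed above (-r for reducers, -inFormat for the InputFormat class, -totalOrder for sampler percentage, sample count, and max splits).

String[] sortArgs = {
    "-r", "20",                                                 // number of reduce tasks
    "-inFormat", "org.apache.hadoop.mapred.SequenceFileInputFormat",
    "-totalOrder", "0.1", "100000", "10",                       // pcnt, numSamples, maxSplits
    "/input/path", "/output/path"                               // the two remaining positional parameters
};
int exitCode = ToolRunner.run(new Configuration(), new Sort(), sortArgs);
System.exit(exitCode);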