List of usage examples for the org.apache.hadoop.mapred.JobConf constructor
public JobConf(boolean loadDefaults)
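Before the project-specific examples below, here is a minimal, self-contained sketch of the pattern they all share: build a JobConf, configure input/output formats and output types, then submit the job with JobClient.runJob. The class name JobConfExample and the identity map-only "copy" job are illustrative assumptions, not taken from any of the listed source files.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class JobConfExample {
    public static void main(String[] args) throws Exception {
        // JobConf(Class) loads the default resources and uses the given class
        // to locate the job jar; JobConf(Configuration) wraps an existing conf instead.
        JobConf conf = new JobConf(JobConfExample.class);
        conf.setJobName("jobconf-example");

        // No mapper is set, so the identity mapper emits (offset, line) pairs.
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);
        conf.setNumReduceTasks(0); // map-only pass-through job

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        // Input and output paths come from the command line (placeholders).
        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        // Submit the job and block until it completes.
        JobClient.runJob(conf);
    }
}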
From source file:com.digitalpebble.behemoth.util.CorpusFilter.java
License:Apache License
public int run(String[] args) throws Exception {
    Options options = new Options();
    // automatically generate the help statement
    HelpFormatter formatter = new HelpFormatter();
    // create the parser
    CommandLineParser parser = new GnuParser();
    options.addOption("h", "help", false, "print this message");
    options.addOption("i", "input", true, "input Behemoth corpus");
    options.addOption("o", "output", true, "output Behemoth corpus");

    // parse the command line arguments
    CommandLine line = null;
    try {
        line = parser.parse(options, args);
        String input = line.getOptionValue("i");
        String output = line.getOptionValue("o");
        if (line.hasOption("help")) {
            formatter.printHelp("CorpusFilter", options);
            return 0;
        }
        if (input == null || output == null) {
            formatter.printHelp("CorpusFilter", options);
            return -1;
        }
    } catch (ParseException e) {
        // bail out here, otherwise 'line' would be null below
        formatter.printHelp("CorpusFilter", options);
        return -1;
    }

    final FileSystem fs = FileSystem.get(getConf());

    Path inputPath = new Path(line.getOptionValue("i"));
    Path outputPath = new Path(line.getOptionValue("o"));

    JobConf job = new JobConf(getConf());
    job.setJarByClass(this.getClass());

    job.setJobName("CorpusFilter : " + inputPath.toString());

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(BehemothDocument.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BehemothDocument.class);

    boolean isFilterRequired = BehemothMapper.isRequired(job);
    // should be the case here
    if (!isFilterRequired) {
        System.err.println("No filters configured. Check your behemoth-site.xml");
        return -1;
    }

    job.setMapperClass(BehemothMapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    try {
        JobClient.runJob(job);
    } catch (Exception e) {
        e.printStackTrace();
        fs.delete(outputPath, true);
    }
    return 0;
}
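The run() method above relies on getConf(), which is only populated when the class is launched through ToolRunner. A minimal driver for that pattern is sketched below; it assumes CorpusFilter implements org.apache.hadoop.util.Tool (as the use of run() and getConf() suggests), which is not shown in the snippet itself.

public static void main(String[] args) throws Exception {
    // ToolRunner parses generic options (-D, -conf, -fs, ...) into the
    // Configuration that run() later retrieves via getConf().
    int res = org.apache.hadoop.util.ToolRunner.run(
            new org.apache.hadoop.conf.Configuration(), new CorpusFilter(), args);
    System.exit(res);
}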
From source file:com.dynamicalsoftware.feed.mapreduce.AggregatePerformanceData.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (args.length >= 2) {
        JobConf conf = new JobConf(AggregatePerformanceData.class);
        conf.setJobName("aggregate news feed performance data");

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);

        conf.setMapperClass(AggregatePerformanceData.Map.class);
        conf.setReducerClass(AggregatePerformanceData.Reduce.class);

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    } else {
        System.err.println("\nusage: AggregatePerformanceData input_directory output_directory\n");
    }
}
From source file:com.dynamicalsoftware.hadoop.mapreduce.SanFranciscoCrime.java
License:Apache License
private static void generate(String name, Class mapper, String input, String output) throws IOException {
    JobConf conf = new JobConf(SanFranciscoCrime.class);
    conf.setJobName(name);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapperClass(mapper);
    conf.setReducerClass(ReduceByWeek.class);
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    FileInputFormat.setInputPaths(conf, new Path(input));
    FileOutputFormat.setOutputPath(conf, new Path(output));
    JobClient.runJob(conf);
}
From source file:com.dynamicalsoftware.hadoop.mapreduce.SanFranciscoCrimePrepOlap.java
License:Apache License
/**
 * Sets up and runs the Hadoop map/reduce job itself.
 * @param name contains the name of the job itself
 * @param mapper identifies which mapper class to use
 * @param input is the fully qualified path to the raw crime data
 * @param output is the fully qualified path to where the generated data should reside
 * @throws IOException
 */
private static void generate(String name, Class mapper, String input, String output) throws IOException {
    JobConf conf = new JobConf(SanFranciscoCrimePrepOlap.class);
    conf.setJobName(name);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapperClass(mapper);
    conf.setReducerClass(Reduce.class);
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    FileInputFormat.setInputPaths(conf, new Path(input));
    FileOutputFormat.setOutputPath(conf, new Path(output));
    JobClient.runJob(conf);
}
From source file:com.ebay.erl.mobius.core.builder.Dataset.java
License:Apache License
/**
 * Create a Hadoop JobConf that represents this dataset.
 * <p>
 *
 * This method is called by Mobius.
 */
public JobConf createJobConf(byte id) throws IOException {
    // preparing to create the new job, write the job conf
    this.id = id;
    if (this.tupleConstraint != null)
        this.conf.set(this.id + ".tuple.criteria", SerializableUtil.serializeToBase64(this.tupleConstraint));

    StringBuffer schemaStr = new StringBuffer();
    Iterator<String> it = this.getSchema().iterator();
    while (it.hasNext()) {
        schemaStr.append(it.next());
        if (it.hasNext())
            schemaStr.append(",");
    }
    this.conf.set(this.id + ".schema", schemaStr.toString());

    // setup computed columns, if any
    if (this.computedColumns != null && this.computedColumns.size() > 0) {
        this.conf.set(this.id + ".computed.columns", SerializableUtil.serializeToBase64(this.computedColumns));
    }

    // setup id to name mapping
    String mapping = this.getID() + ";" + this.getName();
    conf.set(ConfigureConstants.DATASET_ID_TO_NAME_MAPPING, mapping);

    return new JobConf(this.conf);
}
From source file:com.ebay.erl.mobius.core.GroupByConfigure.java
License:Apache License
/**
 * Specify the columns to be grouped by.
 * <p>
 *
 * <code>columns</code> must all be in the participating
 * {@link Dataset}, the one specified in {@link MobiusJob#group(Dataset)}.
 */
public Persistable by(String... columns) throws IOException {
    if (columns == null || columns.length == 0) {
        throw new IllegalArgumentException("Please specify the columns to group by.");
    }

    Column[] projections = new Column[columns.length];
    for (int i = 0; i < columns.length; i++) {
        projections[i] = new Column(this.dataset, columns[i]);
    }

    // check if the specified columns are in the selected
    // dataset or not.
    JobSetup.validateColumns(this.dataset, projections);

    Byte datasetID = 0;

    // validation complete, set the key columns
    Configuration aJobConf = this.dataset.createJobConf(datasetID);
    this.jobConf = Util.merge(this.jobConf, aJobConf);

    this.jobConf.set("mapred.job.name", "Group " + this.dataset.getName() + " by " + Arrays.toString(columns));
    this.jobConf.set(ConfigureConstants.MAPPER_CLASS, this.dataset.getMapper().getCanonicalName());

    String joinKeyPropertyName = datasetID + ".key.columns";
    for (Column aColumn : projections) {
        if (this.jobConf.get(joinKeyPropertyName, "").isEmpty()) {
            this.jobConf.set(joinKeyPropertyName, aColumn.getInputColumnName());
        } else {
            this.jobConf.set(joinKeyPropertyName,
                    this.jobConf.get(joinKeyPropertyName) + "," + aColumn.getInputColumnName());
        }
    }

    this.jobConf.set(ConfigureConstants.ALL_GROUP_KEY_COLUMNS, SerializableUtil.serializeToBase64(projections));

    return new Persistable(new JobConf(this.jobConf), this.dataset);
}
From source file:com.ebay.erl.mobius.core.JoinOnConfigure.java
License:Apache License
/**
 * Specify the joining columns from the datasets.
 * <p>
 *
 * When there is more than one {@link EQ} in the
 * argument, they will be concatenated together with AND.
 * <p>
 *
 * Mobius only supports equi-joins, e.g., dataset1.column1 = dataset2.column1.
 */
public Persistable on(EQ... eqs) throws IOException {
    if (eqs == null || eqs.length == 0) {
        throw new IllegalArgumentException("Please set at least one join key");
    }

    Set<Column> keyColumns = new HashSet<Column>();

    for (EQ anEQ : eqs) {
        for (Column aColumn : anEQ.columns) {
            this.setJoinKey(aColumn);
            keyColumns.add(aColumn);
        }
    }

    this.jobConf.set(ConfigureConstants.ALL_GROUP_KEY_COLUMNS,
            SerializableUtil.serializeToBase64(keyColumns.toArray(new Column[0])));

    StringBuffer involvedDSName = new StringBuffer();
    for (int i = 0; i < this.datasets.length; i++) {
        involvedDSName.append(this.datasets[i].getName());
        if (i < this.datasets.length - 1)
            involvedDSName.append(", ");
    }

    boolean isOuterJoin = this.jobConf.getBoolean(ConfigureConstants.IS_OUTER_JOIN, false);

    this.jobConf.set("mapred.job.name", (isOuterJoin ? "Outer Join " : "Inner Join ")
            + involvedDSName.toString() + " On " + Arrays.toString(eqs));

    return new Persistable(new JobConf(this.jobConf), this.datasets);
}
From source file:com.ebay.erl.mobius.core.Persistable.java
License:Apache License
Persistable(Configuration jobConf, Dataset... datasets) {
    this.jobConf = new JobConf(jobConf);
    this.datasets = datasets;
}
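The constructor above wraps the incoming Configuration in a new JobConf, which copies its properties; the Persistable can then mutate its own JobConf without affecting the caller's object. A small stand-alone sketch of that copy behavior (the property name "example.setting" is made up for illustration):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.JobConf;

public class JobConfCopyDemo {
    public static void main(String[] args) {
        Configuration base = new Configuration();
        base.set("example.setting", "original");    // hypothetical property name

        JobConf copy = new JobConf(base);            // snapshots the current properties
        copy.set("example.setting", "changed");

        System.out.println(base.get("example.setting"));  // prints "original"
        System.out.println(copy.get("example.setting"));  // prints "changed"
    }
}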
From source file:com.ebay.erl.mobius.core.Persistable.java
License:Apache License
/**
 * Save the dataset and store the <code>projections</code>
 * into the specified <code>output</code> path in the
 * format of the given <code>outputFormat</code>.
 * <p>
 *
 * Only the rows that meet the <code>criteria</code> will be
 * stored. The <code>criteria</code> can only evaluate the
 * columns specified in the <code>projections</code>.
 * <p>
 *
 * <code>output</code> will be deleted before the job gets started.
 */
public Dataset save(MobiusJob job, Path output, Class<? extends FileOutputFormat> outputFormat,
        TupleCriterion criteria, Projectable... projections) throws IOException {
    if (projections == null || projections.length == 0)
        throw new IllegalArgumentException("Please specify the output columns.");

    // - VALIDATION - make sure no ambiguous column names.
    //
    // make sure the projections don't have two or more different columns that
    // have the same name but come from different datasets, as we are going to use
    // the {@link Column#getOutputColumnName} as the output schema of the
    // returned dataset.
    Set<String> columnNames = new TreeSet<String>(String.CASE_INSENSITIVE_ORDER);
    for (Projectable aColumn : projections) {
        String[] outputSchema = aColumn.getOutputSchema();
        for (String anOutput : outputSchema) {
            if (!columnNames.contains(anOutput)) {
                columnNames.add(anOutput);
            } else {
                throw new IllegalArgumentException(columnNames + " from " + aColumn.toString()
                        + " is ambiguous, it has the same name"
                        + " as another selected projection in a different dataset, please use Column#setNewName(String) to"
                        + " change it.");
            }
        }
    }

    // - VALIDATION - if <code>criteria</code> is not null, need to make
    // sure the columns used in the criteria are in the output columns.
    if (criteria != null) {
        TupleCriterion.validate(columnNames, criteria);
        this.jobConf.set(ConfigureConstants.PERSISTANT_CRITERIA, SerializableUtil.serializeToBase64(criteria));
    }

    // setup {@link Dataset} to {@link Column} mapping so we can set up projection columns
    // for each dataset, and also validate that all the projection columns
    // are from the selected <code>datasets</code> only.
    Map<Dataset, List<Column>> datasetToColumns = new HashMap<Dataset, List<Column>>();

    for (Projectable aFunc : projections) {
        Column[] requiredInputColumns = aFunc.getInputColumns();
        for (Column aColumn : requiredInputColumns) {
            Dataset aDataset = aColumn.getDataset();
            // make sure <code>aDataset</code> is within the participating datasets
            boolean withinSelectedDataset = false;
            for (Dataset aSelectedDataset : this.datasets) {
                if (aSelectedDataset.equals(aDataset)) {
                    withinSelectedDataset = true;
                    break;
                }
            }

            if (!withinSelectedDataset) {
                // the user selected a column from a dataset that is not
                // among the selected datasets of this join/group-by job.
                throw new IllegalArgumentException(aColumn.toString() + " is not within the selected datasets "
                        + "of this join/group task, please select columns only from the selected datasets.");
            }

            List<Column> projectablesInADataset = null;
            if ((projectablesInADataset = datasetToColumns.get(aDataset)) == null) {
                projectablesInADataset = new LinkedList<Column>();
                datasetToColumns.put(aDataset, projectablesInADataset);
            }

            if (!projectablesInADataset.contains(aColumn))
                projectablesInADataset.add(aColumn);
        }
    }

    if (datasetToColumns.keySet().size() != this.datasets.length) {
        throw new IllegalArgumentException(
                "Please select at least one column from each dataset in the join/group-by job.");
    }

    // SETUP JOB
    if (this.userDefinedConf != null) {
        this.jobConf = new JobConf(Util.merge(this.jobConf, this.userDefinedConf));
    }
    this.jobConf.setJarByClass(job.getClass());
    this.jobConf.setMapOutputKeyClass(DataJoinKey.class);
    this.jobConf.setMapOutputValueClass(DataJoinValue.class);
    this.jobConf.setPartitionerClass(DataJoinKeyPartitioner.class);
    this.jobConf.setOutputValueGroupingComparator(DataJoinKey.Comparator.class);
    this.jobConf.setOutputKeyComparatorClass(DataJoinKey.class);
    this.jobConf.setReducerClass(DefaultMobiusReducer.class);
    this.jobConf.set(ConfigureConstants.PROJECTION_COLUMNS, SerializableUtil.serializeToBase64(projections));

    JobSetup.setupOutputs(this.jobConf, output, outputFormat);

    // setup input paths and projection columns for each dataset.
    for (byte assignedDatasetID = 0; assignedDatasetID < this.datasets.length; assignedDatasetID++) {
        Dataset aDataset = this.datasets[assignedDatasetID];

        // setup input for each dataset
        JobSetup.setupInputs(jobConf, aDataset, assignedDatasetID);

        // setup projection for each dataset
        JobSetup.setupProjections(jobConf, aDataset, assignedDatasetID,
                datasetToColumns.get(aDataset).toArray(new Column[0]));
    }

    // setup all dataset IDs
    for (int i = 0; i < this.datasets.length; i++) {
        Byte id = this.datasets[i].getID();
        if (!this.jobConf.get(ConfigureConstants.ALL_DATASET_IDS, "").isEmpty()) {
            this.jobConf.set(ConfigureConstants.ALL_DATASET_IDS,
                    this.jobConf.get(ConfigureConstants.ALL_DATASET_IDS) + "," + id);
        } else {
            this.jobConf.set(ConfigureConstants.ALL_DATASET_IDS, id.toString());
        }
    }

    boolean isCombinable = true;
    for (Projectable aFunc : projections) {
        aFunc.setConf(jobConf);
        if (!aFunc.isCombinable()) {
            isCombinable = false;
            LOGGER.info(aFunc.toString() + " is not combinable, #isCombinable() returned false.");
            break;
        }
        if (aFunc instanceof GroupFunction && aFunc.useGroupKeyOnly()) {
            LOGGER.info(aFunc.toString()
                    + " is a group function and uses only the group key as its input, disabling combiner.");
            isCombinable = false;
            break;
        }
    }

    LOGGER.info("Using Combiner? " + isCombinable);
    if (isCombinable) {
        jobConf.setCombinerClass(DefaultMobiusCombiner.class);
    }

    job.addToExecQueue(jobConf);

    AbstractDatasetBuilder builder = DatasetBuildersFactory.getInstance(job).getBuilder(outputFormat,
            "Dataset_" + output.getName());

    // form the output columns from the projections
    List<String> outputColumns = new ArrayList<String>();
    for (Projectable func : projections) {
        String[] aProjectOutputs = func.getOutputSchema();
        for (String anOutputName : aProjectOutputs) {
            outputColumns.add(anOutputName);
        }
    }

    return builder.buildFromPreviousJob(jobConf, outputFormat, outputColumns.toArray(new String[0]));
}
From source file:com.ebay.erl.mobius.core.SortPersistable.java
License:Apache License
/**
 * Save the sort result to the given <code>output</code> with
 * the specified <code>outputFormat</code>.
 * <p>
 *
 * The returned {@link Dataset} represents the sorted result;
 * it can be used for further analysis.
 */
public Dataset save(MobiusJob job, Path output, Class<? extends FileOutputFormat> outputFormat)
        throws IOException {
    // SETUP JOB
    if (this.userDefinedConf != null) {
        this.jobConf = new JobConf(Util.merge(this.jobConf, this.userDefinedConf));
    }
    this.jobConf.setJarByClass(job.getClass());
    this.jobConf.setMapOutputKeyClass(DataJoinKey.class);
    this.jobConf.setMapOutputValueClass(DataJoinValue.class);
    this.jobConf.setPartitionerClass(TotalOrderPartitioner.class);
    this.jobConf.setOutputKeyComparatorClass(DataJoinKey.class);
    this.jobConf.setReducerClass(TotalSortReducer.class);

    JobSetup.setupOutputs(this.jobConf, output, outputFormat);

    job.addToExecQueue(this.jobConf);

    AbstractDatasetBuilder builder = DatasetBuildersFactory.getInstance(job).getBuilder(outputFormat,
            "Dataset_" + output.getName());

    // form the output columns from the projections
    List<String> outputColumns = new ArrayList<String>();
    for (Projectable func : projections) {
        String[] aProjectOutputs = func.getOutputSchema();
        for (String anOutputName : aProjectOutputs) {
            outputColumns.add(anOutputName);
        }
    }

    return builder.buildFromPreviousJob(jobConf, outputFormat, outputColumns.toArray(new String[0]));
}