List of usage examples for the org.apache.hadoop.mapred.JobConf constructor
public JobConf(boolean loadDefaults)
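Before the project-specific examples below, here is a minimal, self-contained sketch of the pattern they all share: build a JobConf, configure input/output formats and output types, then submit the job with JobClient.runJob. The class name JobConfExample and the identity map-only "copy" job are illustrative assumptions, not taken from any of the listed source files.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class JobConfExample {
    public static void main(String[] args) throws Exception {
        // JobConf(Class) loads the default resources and uses the given class
        // to locate the job jar; JobConf(Configuration) wraps an existing conf instead.
        JobConf conf = new JobConf(JobConfExample.class);
        conf.setJobName("jobconf-example");

        // No mapper is set, so the identity mapper emits (offset, line) pairs.
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);
        conf.setNumReduceTasks(0); // map-only pass-through job

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        // Input and output paths come from the command line (placeholders).
        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        // Submit the job and block until it completes.
        JobClient.runJob(conf);
    }
}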
From source file:com.digitalpebble.behemoth.util.CorpusFilter.java
License:Apache License
public int run(String[] args) throws Exception {
    Options options = new Options();
    // automatically generate the help statement
    HelpFormatter formatter = new HelpFormatter();
    // create the parser
    CommandLineParser parser = new GnuParser();
    options.addOption("h", "help", false, "print this message");
    options.addOption("i", "input", true, "input Behemoth corpus");
    options.addOption("o", "output", true, "output Behemoth corpus");

    // parse the command line arguments
    CommandLine line = null;
    try {
        line = parser.parse(options, args);
        String input = line.getOptionValue("i");
        String output = line.getOptionValue("o");
        if (line.hasOption("help")) {
            formatter.printHelp("CorpusFilter", options);
            return 0;
        }
        if (input == null || output == null) {
            formatter.printHelp("CorpusFilter", options);
            return -1;
        }
    } catch (ParseException e) {
        // bail out here, otherwise 'line' would be null below
        formatter.printHelp("CorpusFilter", options);
        return -1;
    }

    final FileSystem fs = FileSystem.get(getConf());

    Path inputPath = new Path(line.getOptionValue("i"));
    Path outputPath = new Path(line.getOptionValue("o"));

    JobConf job = new JobConf(getConf());
    job.setJarByClass(this.getClass());

    job.setJobName("CorpusFilter : " + inputPath.toString());

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(BehemothDocument.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BehemothDocument.class);

    boolean isFilterRequired = BehemothMapper.isRequired(job);
    // should be the case here
    if (!isFilterRequired) {
        System.err.println("No filters configured. Check your behemoth-site.xml");
        return -1;
    }

    job.setMapperClass(BehemothMapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    try {
        JobClient.runJob(job);
    } catch (Exception e) {
        e.printStackTrace();
        fs.delete(outputPath, true);
    }
    return 0;
}
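The run() method above relies on getConf(), which is only populated when the class is launched through ToolRunner. A minimal driver for that pattern is sketched below; it assumes CorpusFilter implements org.apache.hadoop.util.Tool (as the use of run() and getConf() suggests), which is not shown in the snippet itself.

public static void main(String[] args) throws Exception {
    // ToolRunner parses generic options (-D, -conf, -fs, ...) into the
    // Configuration that run() later retrieves via getConf().
    int res = org.apache.hadoop.util.ToolRunner.run(
            new org.apache.hadoop.conf.Configuration(), new CorpusFilter(), args);
    System.exit(res);
}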
From source file:com.dynamicalsoftware.feed.mapreduce.AggregatePerformanceData.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (args.length >= 2) {
        JobConf conf = new JobConf(AggregatePerformanceData.class);
        conf.setJobName("aggregate news feed performance data");

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);

        conf.setMapperClass(AggregatePerformanceData.Map.class);
        conf.setReducerClass(AggregatePerformanceData.Reduce.class);

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    } else {
        System.err.println("\nusage: AggregatePerformanceData input_directory output_directory\n");
    }
}
From source file:com.dynamicalsoftware.hadoop.mapreduce.SanFranciscoCrime.java
License:Apache License
private static void generate(String name, Class mapper, String input, String output) throws IOException {
    JobConf conf = new JobConf(SanFranciscoCrime.class);
    conf.setJobName(name);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapperClass(mapper);
    conf.setReducerClass(ReduceByWeek.class);
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    FileInputFormat.setInputPaths(conf, new Path(input));
    FileOutputFormat.setOutputPath(conf, new Path(output));
    JobClient.runJob(conf);
}
From source file:com.dynamicalsoftware.hadoop.mapreduce.SanFranciscoCrimePrepOlap.java
License:Apache License
/**
 * Sets up and runs the Hadoop map/reduce job itself.
 * @param name contains the name of the job itself
 * @param mapper identifies which mapper class to use
 * @param input is the fully qualified path to the raw crime data
 * @param output is the fully qualified path to where the generated data should reside
 * @throws IOException
 */
private static void generate(String name, Class mapper, String input, String output) throws IOException {
    JobConf conf = new JobConf(SanFranciscoCrimePrepOlap.class);
    conf.setJobName(name);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapperClass(mapper);
    conf.setReducerClass(Reduce.class);
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    FileInputFormat.setInputPaths(conf, new Path(input));
    FileOutputFormat.setOutputPath(conf, new Path(output));
    JobClient.runJob(conf);
}
From source file:com.ebay.erl.mobius.core.builder.Dataset.java
License:Apache License
/**
 * Create a Hadoop JobConf that represents this dataset.
 * <p>
 *
 * This method is called by Mobius.
 */
public JobConf createJobConf(byte id) throws IOException {
    // preparing to create the new job, write the job conf
    this.id = id;
    if (this.tupleConstraint != null)
        this.conf.set(this.id + ".tuple.criteria", SerializableUtil.serializeToBase64(this.tupleConstraint));

    StringBuffer schemaStr = new StringBuffer();
    Iterator<String> it = this.getSchema().iterator();
    while (it.hasNext()) {
        schemaStr.append(it.next());
        if (it.hasNext())
            schemaStr.append(",");
    }
    this.conf.set(this.id + ".schema", schemaStr.toString());

    // setup computed columns, if any
    if (this.computedColumns != null && this.computedColumns.size() > 0) {
        this.conf.set(this.id + ".computed.columns", SerializableUtil.serializeToBase64(this.computedColumns));
    }

    // setup id to name mapping
    String mapping = this.getID() + ";" + this.getName();
    conf.set(ConfigureConstants.DATASET_ID_TO_NAME_MAPPING, mapping);

    return new JobConf(this.conf);
}
From source file:com.ebay.erl.mobius.core.GroupByConfigure.java
License:Apache License
/**
 * Specify the columns to be grouped by.
 * <p>
 *
 * <code>columns</code> must all be in the participating
 * {@link Dataset}, the one specified in {@link MobiusJob#group(Dataset)}.
 */
public Persistable by(String... columns) throws IOException {
    if (columns == null || columns.length == 0) {
        throw new IllegalArgumentException("Please specify the columns to group by.");
    }

    Column[] projections = new Column[columns.length];
    for (int i = 0; i < columns.length; i++) {
        projections[i] = new Column(this.dataset, columns[i]);
    }

    // check if the specified columns are in the selected
    // dataset or not.
    JobSetup.validateColumns(this.dataset, projections);

    Byte datasetID = 0;

    // validation complete, set the key columns
    Configuration aJobConf = this.dataset.createJobConf(datasetID);
    this.jobConf = Util.merge(this.jobConf, aJobConf);

    this.jobConf.set("mapred.job.name", "Group " + this.dataset.getName() + " by " + Arrays.toString(columns));
    this.jobConf.set(ConfigureConstants.MAPPER_CLASS, this.dataset.getMapper().getCanonicalName());

    String joinKeyPropertyName = datasetID + ".key.columns";
    for (Column aColumn : projections) {
        if (this.jobConf.get(joinKeyPropertyName, "").isEmpty()) {
            this.jobConf.set(joinKeyPropertyName, aColumn.getInputColumnName());
        } else {
            this.jobConf.set(joinKeyPropertyName,
                    this.jobConf.get(joinKeyPropertyName) + "," + aColumn.getInputColumnName());
        }
    }

    this.jobConf.set(ConfigureConstants.ALL_GROUP_KEY_COLUMNS, SerializableUtil.serializeToBase64(projections));

    return new Persistable(new JobConf(this.jobConf), this.dataset);
}
From source file:com.ebay.erl.mobius.core.JoinOnConfigure.java
License:Apache License
/**
 * Specify the joining columns from the datasets.
 * <p>
 *
 * When there is more than one {@link EQ} in the
 * argument, they will be concatenated together with AND.
 * <p>
 *
 * Mobius only supports equi-joins, e.g., dataset1.column1 = dataset2.column1.
 */
public Persistable on(EQ... eqs) throws IOException {
    if (eqs == null || eqs.length == 0) {
        throw new IllegalArgumentException("Please set at least one join key");
    }

    Set<Column> keyColumns = new HashSet<Column>();

    for (EQ anEQ : eqs) {
        for (Column aColumn : anEQ.columns) {
            this.setJoinKey(aColumn);
            keyColumns.add(aColumn);
        }
    }

    this.jobConf.set(ConfigureConstants.ALL_GROUP_KEY_COLUMNS,
            SerializableUtil.serializeToBase64(keyColumns.toArray(new Column[0])));

    StringBuffer involvedDSName = new StringBuffer();
    for (int i = 0; i < this.datasets.length; i++) {
        involvedDSName.append(this.datasets[i].getName());
        if (i < this.datasets.length - 1)
            involvedDSName.append(", ");
    }

    boolean isOuterJoin = this.jobConf.getBoolean(ConfigureConstants.IS_OUTER_JOIN, false);

    this.jobConf.set("mapred.job.name", (isOuterJoin ? "Outer Join " : "Inner Join ")
            + involvedDSName.toString() + " On " + Arrays.toString(eqs));

    return new Persistable(new JobConf(this.jobConf), this.datasets);
}
From source file:com.ebay.erl.mobius.core.Persistable.java
License:Apache License
Persistable(Configuration jobConf, Dataset... datasets) {
    this.jobConf = new JobConf(jobConf);
    this.datasets = datasets;
}
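The constructor above wraps the incoming Configuration in a new JobConf, which copies its properties; the Persistable can then mutate its own JobConf without affecting the caller's object. A small stand-alone sketch of that copy behavior (the property name "example.setting" is made up for illustration):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.JobConf;

public class JobConfCopyDemo {
    public static void main(String[] args) {
        Configuration base = new Configuration();
        base.set("example.setting", "original");    // hypothetical property name

        JobConf copy = new JobConf(base);            // snapshots the current properties
        copy.set("example.setting", "changed");

        System.out.println(base.get("example.setting"));  // prints "original"
        System.out.println(copy.get("example.setting"));  // prints "changed"
    }
}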
From source file:com.ebay.erl.mobius.core.Persistable.java
License:Apache License
/**
 * Save the dataset and store the <code>projections</code>
 * into the specified <code>output</code> path in the
 * format of the given <code>outputFormat</code>.
 * <p>
 *
 * Only the rows that meet the <code>criteria</code> will be
 * stored. The <code>criteria</code> can only evaluate the
 * columns specified in the <code>projections</code>.
 * <p>
 *
 * <code>output</code> will be deleted before the job gets started.
 */
public Dataset save(MobiusJob job, Path output, Class<? extends FileOutputFormat> outputFormat,
        TupleCriterion criteria, Projectable... projections) throws IOException {
    if (projections == null || projections.length == 0)
        throw new IllegalArgumentException("Please specify the output columns.");

    // - VALIDATION - make sure no ambiguous column names.
    //
    // make sure the projections don't have two or more different columns that
    // have the same name but come from different datasets, as we are going to use
    // the {@link Column#getOutputColumnName} as the output schema of the
    // returned dataset.
    Set<String> columnNames = new TreeSet<String>(String.CASE_INSENSITIVE_ORDER);
    for (Projectable aColumn : projections) {
        String[] outputSchema = aColumn.getOutputSchema();
        for (String anOutput : outputSchema) {
            if (!columnNames.contains(anOutput)) {
                columnNames.add(anOutput);
            } else {
                throw new IllegalArgumentException(columnNames + " from " + aColumn.toString()
                        + " is ambiguous, it has the same name"
                        + " as another selected projection in a different dataset, please use Column#setNewName(String) to"
                        + " change it.");
            }
        }
    }

    // - VALIDATION - if <code>criteria</code> is not null, need to make
    // sure the columns used in the criteria are in the output columns.
    if (criteria != null) {
        TupleCriterion.validate(columnNames, criteria);
        this.jobConf.set(ConfigureConstants.PERSISTANT_CRITERIA, SerializableUtil.serializeToBase64(criteria));
    }

    // setup {@link Dataset} to {@link Column} mapping so we can set up projection columns
    // for each dataset, and also validate that all the projection columns
    // are from the selected <code>datasets</code> only.
    Map<Dataset, List<Column>> datasetToColumns = new HashMap<Dataset, List<Column>>();

    for (Projectable aFunc : projections) {
        Column[] requiredInputColumns = aFunc.getInputColumns();
        for (Column aColumn : requiredInputColumns) {
            Dataset aDataset = aColumn.getDataset();
            // make sure <code>aDataset</code> is within the participating datasets
            boolean withinSelectedDataset = false;
            for (Dataset aSelectedDataset : this.datasets) {
                if (aSelectedDataset.equals(aDataset)) {
                    withinSelectedDataset = true;
                    break;
                }
            }

            if (!withinSelectedDataset) {
                // the user selected a column from a dataset that is not
                // among the selected datasets of this join/group-by job.
                throw new IllegalArgumentException(aColumn.toString() + " is not within the selected datasets "
                        + "of this join/group task, please select columns only from the selected datasets.");
            }

            List<Column> projectablesInADataset = null;
            if ((projectablesInADataset = datasetToColumns.get(aDataset)) == null) {
                projectablesInADataset = new LinkedList<Column>();
                datasetToColumns.put(aDataset, projectablesInADataset);
            }

            if (!projectablesInADataset.contains(aColumn))
                projectablesInADataset.add(aColumn);
        }
    }

    if (datasetToColumns.keySet().size() != this.datasets.length) {
        throw new IllegalArgumentException(
                "Please select at least one column from each dataset in the join/group-by job.");
    }

    // SETUP JOB
    if (this.userDefinedConf != null) {
        this.jobConf = new JobConf(Util.merge(this.jobConf, this.userDefinedConf));
    }
    this.jobConf.setJarByClass(job.getClass());
    this.jobConf.setMapOutputKeyClass(DataJoinKey.class);
    this.jobConf.setMapOutputValueClass(DataJoinValue.class);
    this.jobConf.setPartitionerClass(DataJoinKeyPartitioner.class);
    this.jobConf.setOutputValueGroupingComparator(DataJoinKey.Comparator.class);
    this.jobConf.setOutputKeyComparatorClass(DataJoinKey.class);
    this.jobConf.setReducerClass(DefaultMobiusReducer.class);
    this.jobConf.set(ConfigureConstants.PROJECTION_COLUMNS, SerializableUtil.serializeToBase64(projections));

    JobSetup.setupOutputs(this.jobConf, output, outputFormat);

    // setup input paths and projection columns for each dataset.
    for (byte assignedDatasetID = 0; assignedDatasetID < this.datasets.length; assignedDatasetID++) {
        Dataset aDataset = this.datasets[assignedDatasetID];

        // setup input for each dataset
        JobSetup.setupInputs(jobConf, aDataset, assignedDatasetID);

        // setup projection for each dataset
        JobSetup.setupProjections(jobConf, aDataset, assignedDatasetID,
                datasetToColumns.get(aDataset).toArray(new Column[0]));
    }

    // setup all dataset IDs
    for (int i = 0; i < this.datasets.length; i++) {
        Byte id = this.datasets[i].getID();
        if (!this.jobConf.get(ConfigureConstants.ALL_DATASET_IDS, "").isEmpty()) {
            this.jobConf.set(ConfigureConstants.ALL_DATASET_IDS,
                    this.jobConf.get(ConfigureConstants.ALL_DATASET_IDS) + "," + id);
        } else {
            this.jobConf.set(ConfigureConstants.ALL_DATASET_IDS, id.toString());
        }
    }

    boolean isCombinable = true;
    for (Projectable aFunc : projections) {
        aFunc.setConf(jobConf);
        if (!aFunc.isCombinable()) {
            isCombinable = false;
            LOGGER.info(aFunc.toString() + " is not combinable, #isCombinable() returned false.");
            break;
        }
        if (aFunc instanceof GroupFunction && aFunc.useGroupKeyOnly()) {
            LOGGER.info(aFunc.toString()
                    + " is a group function and uses only the group key as its input, disabling combiner.");
            isCombinable = false;
            break;
        }
    }

    LOGGER.info("Using Combiner? " + isCombinable);
    if (isCombinable) {
        jobConf.setCombinerClass(DefaultMobiusCombiner.class);
    }

    job.addToExecQueue(jobConf);

    AbstractDatasetBuilder builder = DatasetBuildersFactory.getInstance(job).getBuilder(outputFormat,
            "Dataset_" + output.getName());

    // form the output columns from the projections
    List<String> outputColumns = new ArrayList<String>();
    for (Projectable func : projections) {
        String[] aProjectOutputs = func.getOutputSchema();
        for (String anOutputName : aProjectOutputs) {
            outputColumns.add(anOutputName);
        }
    }

    return builder.buildFromPreviousJob(jobConf, outputFormat, outputColumns.toArray(new String[0]));
}
From source file:com.ebay.erl.mobius.core.SortPersistable.java
License:Apache License
/**
 * Save the sort result to the given <code>output</code> with
 * the specified <code>outputFormat</code>.
 * <p>
 *
 * The returned {@link Dataset} represents the sorted result;
 * it can be used for further analysis.
 */
public Dataset save(MobiusJob job, Path output, Class<? extends FileOutputFormat> outputFormat)
        throws IOException {
    // SETUP JOB
    if (this.userDefinedConf != null) {
        this.jobConf = new JobConf(Util.merge(this.jobConf, this.userDefinedConf));
    }
    this.jobConf.setJarByClass(job.getClass());
    this.jobConf.setMapOutputKeyClass(DataJoinKey.class);
    this.jobConf.setMapOutputValueClass(DataJoinValue.class);
    this.jobConf.setPartitionerClass(TotalOrderPartitioner.class);
    this.jobConf.setOutputKeyComparatorClass(DataJoinKey.class);
    this.jobConf.setReducerClass(TotalSortReducer.class);

    JobSetup.setupOutputs(this.jobConf, output, outputFormat);

    job.addToExecQueue(this.jobConf);

    AbstractDatasetBuilder builder = DatasetBuildersFactory.getInstance(job).getBuilder(outputFormat,
            "Dataset_" + output.getName());

    // form the output columns from the projections
    List<String> outputColumns = new ArrayList<String>();
    for (Projectable func : projections) {
        String[] aProjectOutputs = func.getOutputSchema();
        for (String anOutputName : aProjectOutputs) {
            outputColumns.add(anOutputName);
        }
    }

    return builder.buildFromPreviousJob(jobConf, outputFormat, outputColumns.toArray(new String[0]));
}