Example usage for org.apache.hadoop.mapred JobConf JobConf

List of usage examples for org.apache.hadoop.mapred JobConf JobConf

Introduction

On this page you can find example usages of the org.apache.hadoop.mapred JobConf constructor.

Prototype

public JobConf(boolean loadDefaults) 

Document

A new map/reduce configuration where the behavior of reading from the default resources can be turned off.
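
A minimal sketch of this constructor in isolation (the resource path and property name below are assumptions for illustration, not taken from the examples that follow): with loadDefaults set to false, the usual *-default.xml / *-site.xml resources are not read, so only explicitly added resources and programmatic settings are visible.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

public class JobConfNoDefaults {
    public static void main(String[] args) {
        // Turn off loading of the default resources.
        JobConf conf = new JobConf(false);
        // Only resources added explicitly are read into this configuration.
        conf.addResource(new Path("/etc/hadoop/conf/my-job.xml")); // hypothetical path
        conf.set("mapreduce.job.name", "defaults-disabled-example");
        System.out.println(conf.get("mapreduce.job.name"));
    }
}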

Usage

From source file:com.digitalpebble.behemoth.util.CorpusFilter.java

License:Apache License

public int run(String[] args) throws Exception {

    Options options = new Options();
    // automatically generate the help statement
    HelpFormatter formatter = new HelpFormatter();
    // create the parser
    CommandLineParser parser = new GnuParser();

    options.addOption("h", "help", false, "print this message");
    options.addOption("i", "input", true, "input Behemoth corpus");
    options.addOption("o", "output", true, "output Behemoth corpus");

    // parse the command line arguments
    CommandLine line = null;
    try {
        line = parser.parse(options, args);
        String input = line.getOptionValue("i");
        String output = line.getOptionValue("o");
        if (line.hasOption("help")) {
            formatter.printHelp("CorpusFilter", options);
            return 0;
        }
        if (input == null || output == null) {
            formatter.printHelp("CorpusFilter", options);
            return -1;
        }
    } catch (ParseException e) {
        formatter.printHelp("CorpusFilter", options);
        // bail out here; otherwise line would still be null below
        return -1;
    }

    final FileSystem fs = FileSystem.get(getConf());

    Path inputPath = new Path(line.getOptionValue("i"));
    Path outputPath = new Path(line.getOptionValue("o"));

    JobConf job = new JobConf(getConf());
    job.setJarByClass(this.getClass());

    job.setJobName("CorpusFilter : " + inputPath.toString());

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(BehemothDocument.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BehemothDocument.class);

    boolean isFilterRequired = BehemothMapper.isRequired(job);
    // should be the case here
    if (!isFilterRequired) {
        System.err.println("No filters configured. Check your behemoth-site.xml");
        return -1;
    }
    job.setMapperClass(BehemothMapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    try {
        JobClient.runJob(job);
    } catch (Exception e) {
        e.printStackTrace();
        fs.delete(outputPath, true);
    }

    return 0;
}
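
The run(String[]) method above reads its configuration through getConf(), which suggests CorpusFilter follows Hadoop's Tool pattern. A minimal driver sketch under that assumption (not part of the source excerpt above):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

public class CorpusFilterDriver {
    public static void main(String[] args) throws Exception {
        // ToolRunner handles the generic Hadoop options (-D, -conf, ...) and
        // then delegates the remaining arguments to CorpusFilter#run.
        int res = ToolRunner.run(new Configuration(), new CorpusFilter(), args);
        System.exit(res);
    }
}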

From source file:com.dynamicalsoftware.feed.mapreduce.AggregatePerformanceData.java

License:Apache License

public static void main(String[] args) throws Exception {
    if (args.length >= 2) {
        JobConf conf = new JobConf(AggregatePerformanceData.class);
        conf.setJobName("aggregate news feed performance data");
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);
        conf.setMapperClass(AggregatePerformanceData.Map.class);
        conf.setReducerClass(AggregatePerformanceData.Reduce.class);
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));
        JobClient.runJob(conf);
    } else {
        System.err.println("\nusage: AggregatePerformanceData input_directory output_directory\n");
    }
}

From source file:com.dynamicalsoftware.hadoop.mapreduce.SanFranciscoCrime.java

License:Apache License

private static void generate(String name, Class mapper, String input, String output) throws IOException {
    JobConf conf = new JobConf(SanFranciscoCrime.class);
    conf.setJobName(name);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapperClass(mapper);
    conf.setReducerClass(ReduceByWeek.class);
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    FileInputFormat.setInputPaths(conf, new Path(input));
    FileOutputFormat.setOutputPath(conf, new Path(output));
    JobClient.runJob(conf);
}

From source file:com.dynamicalsoftware.hadoop.mapreduce.SanFranciscoCrimePrepOlap.java

License:Apache License

/**
 * sets up and runs the hadoop map/reduce job itself
 * @param name contains the name of the job itself
 * @param mapper identifies which mapper class to use
 * @param input is the fully qualified path to the raw crime data
 * @param output is the fully qualified path to where the generated data should reside
 * @throws IOException
 */
private static void generate(String name, Class mapper, String input, String output) throws IOException {
    JobConf conf = new JobConf(SanFranciscoCrimePrepOlap.class);
    conf.setJobName(name);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapperClass(mapper);
    conf.setReducerClass(Reduce.class);
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    FileInputFormat.setInputPaths(conf, new Path(input));
    FileOutputFormat.setOutputPath(conf, new Path(output));
    JobClient.runJob(conf);
}
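
A hypothetical invocation of this helper from a main() in the same class (the mapper class name and argument layout are assumptions; the source excerpt only shows that a mapper class and input/output paths are expected):

    public static void main(String[] args) throws IOException {
        // CrimeMapper is a placeholder name for a mapper defined elsewhere in the class.
        generate("san francisco crime prep", CrimeMapper.class, args[0], args[1]);
    }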

From source file:com.ebay.erl.mobius.core.builder.Dataset.java

License:Apache License

/**
 * Create a Hadoop JobConf that represents this dataset.
 * <p>
 * 
 * This method is called by Mobius.
 */
public JobConf createJobConf(byte id) throws IOException {
    // preparing to create the new job, write the job conf

    this.id = id;

    if (this.tupleConstraint != null)
        this.conf.set(this.id + ".tuple.criteria", SerializableUtil.serializeToBase64(this.tupleConstraint));

    StringBuffer schemaStr = new StringBuffer();
    Iterator<String> it = this.getSchema().iterator();
    while (it.hasNext()) {
        schemaStr.append(it.next());
        if (it.hasNext())
            schemaStr.append(",");
    }
    this.conf.set(this.id + ".schema", schemaStr.toString());

    // setup computed columns, if any
    if (this.computedColumns != null && this.computedColumns.size() > 0) {
        this.conf.set(this.id + ".computed.columns", SerializableUtil.serializeToBase64(this.computedColumns));
    }

    // setup id to name mapping
    String mapping = this.getID() + ";" + this.getName();

    conf.set(ConfigureConstants.DATASET_ID_TO_NAME_MAPPING, mapping);

    return new JobConf(this.conf);
}

From source file:com.ebay.erl.mobius.core.GroupByConfigure.java

License:Apache License

/**
 * Specify the columns to be grouped by.
 * <p>
 * 
 * The <code>columns</code> must all be in the
 * participating {@link Dataset}, the
 * one specified in {@link MobiusJob#group(Dataset)}.
 */
public Persistable by(String... columns) throws IOException {
    if (columns == null || columns.length == 0) {
        throw new IllegalArgumentException("Please specify the columns to group by.");
    }

    Column[] projections = new Column[columns.length];
    for (int i = 0; i < columns.length; i++) {
        projections[i] = new Column(this.dataset, columns[i]);
    }

    // check if the specified columns are in the selected
    // dataset or not.
    JobSetup.validateColumns(this.dataset, projections);

    Byte datasetID = 0;

    // validation complete, set the key column
    Configuration aJobConf = this.dataset.createJobConf(datasetID);
    this.jobConf = Util.merge(this.jobConf, aJobConf);
    this.jobConf.set("mapred.job.name", "Group " + this.dataset.getName() + " by " + Arrays.toString(columns));
    this.jobConf.set(ConfigureConstants.MAPPER_CLASS, this.dataset.getMapper().getCanonicalName());

    String joinKeyPropertyName = datasetID + ".key.columns";

    for (Column aColumn : projections) {
        if (this.jobConf.get(joinKeyPropertyName, "").isEmpty()) {
            this.jobConf.set(joinKeyPropertyName, aColumn.getInputColumnName());
        } else {
            this.jobConf.set(joinKeyPropertyName,
                    this.jobConf.get(joinKeyPropertyName) + "," + aColumn.getInputColumnName());
        }
    }
    this.jobConf.set(ConfigureConstants.ALL_GROUP_KEY_COLUMNS, SerializableUtil.serializeToBase64(projections));
    return new Persistable(new JobConf(this.jobConf), this.dataset);
}

From source file:com.ebay.erl.mobius.core.JoinOnConfigure.java

License:Apache License

/**
 * Specify the joining columns from the dataset.
 * <p>
 * 
 * When there is more than one {@link EQ} in the
 * arguments, they are concatenated together
 * with AND.
 * <p>
 * 
 * Mobius only supports equi-joins, e.g., dataset1.column1=dataset2.column1.
 */
public Persistable on(EQ... eqs) throws IOException {
    if (eqs == null || eqs.length == 0) {
        throw new IllegalArgumentException("Please set at least one join key");
    }

    Set<Column> keyColumns = new HashSet<Column>();

    for (EQ anEQ : eqs) {
        for (Column aColumn : anEQ.columns) {
            this.setJoinKey(aColumn);
            keyColumns.add(aColumn);
        }
    }

    this.jobConf.set(ConfigureConstants.ALL_GROUP_KEY_COLUMNS,
            SerializableUtil.serializeToBase64(keyColumns.toArray(new Column[0])));
    StringBuffer involvedDSName = new StringBuffer();
    for (int i = 0; i < this.datasets.length; i++) {
        involvedDSName.append(this.datasets[i].getName());
        if (i < this.datasets.length - 1)
            involvedDSName.append(", ");
    }
    boolean isOuterJoin = this.jobConf.getBoolean(ConfigureConstants.IS_OUTER_JOIN, false);

    this.jobConf.set("mapred.job.name", (isOuterJoin ? "Outer Join " : "Inner Join ")
            + involvedDSName.toString() + " On " + Arrays.toString(eqs));

    return new Persistable(new JobConf(this.jobConf), this.datasets);
}

From source file:com.ebay.erl.mobius.core.Persistable.java

License:Apache License

Persistable(Configuration jobConf, Dataset... datasets) {
    this.jobConf = new JobConf(jobConf);
    this.datasets = datasets;
}

From source file:com.ebay.erl.mobius.core.Persistable.java

License:Apache License

/**
 * Save the dataset and store the <code>projections</code>
 * into the specified <code>output</code> path in the
 * format of the given <code>outputFormat</code>.
 * <p>
 * 
 * Only the rows that meet the <code>criteria</code> will be 
 * stored.  The <code>criteria</code> can only evaluate the 
 * columns specified in the <code>projections</code>.
 * <p>
 * 
 * <code>output</code> will be deleted before the job gets started.
 */
public Dataset save(MobiusJob job, Path output, Class<? extends FileOutputFormat> outputFormat,
        TupleCriterion criteria, Projectable... projections) throws IOException {
    if (projections == null || projections.length == 0)
        throw new IllegalArgumentException("Please specify the output columns.");

    // - VALIDATION - make sure no ambiguous column names.
    //
    // make sure the projections don't have two or more different columns that
    // have the same name but in different dataset, as we are going the use 
    // the {@link Column#getOutputColumnName} as the output schema of the
    // returned dataset.
    Set<String> columnNames = new TreeSet<String>(String.CASE_INSENSITIVE_ORDER);
    for (Projectable aColumn : projections) {
        String[] outputSchema = aColumn.getOutputSchema();
        for (String anOutput : outputSchema) {
            if (!columnNames.contains(anOutput)) {
                columnNames.add(anOutput);
            } else {
                throw new IllegalArgumentException(anOutput + " from " + aColumn.toString()
                        + " is ambiguous; it has the same name "
                        + "as another selected projection in a different dataset. Please use Column#setNewName(String) "
                        + "to change it.");
            }
        }
    }

    // - VALIDATION - if <code>criteria</code> is not null, need to make
    // sure the columns used in the criteria are in the output columns.
    if (criteria != null) {
        TupleCriterion.validate(columnNames, criteria);
        this.jobConf.set(ConfigureConstants.PERSISTANT_CRITERIA, SerializableUtil.serializeToBase64(criteria));
    }

    // setup {@link Dataset} to {@link Column} mapping so we can set up the projection columns
    // for each dataset, and also validate that all the projection columns
    // come from the selected <code>datasets</code> only.
    Map<Dataset, List<Column>> datasetToColumns = new HashMap<Dataset, List<Column>>();

    for (Projectable aFunc : projections) {
        Column[] requiredInputColumns = aFunc.getInputColumns();
        for (Column aColumn : requiredInputColumns) {
            Dataset aDataset = aColumn.getDataset();
            // make sure the <code>aDataset</code> within the participated datasets
            boolean withinSelectedDataset = false;
            for (Dataset aSelectedDataset : this.datasets) {
                if (aSelectedDataset.equals(aDataset)) {
                    withinSelectedDataset = true;
                    break;
                }
            }

            if (!withinSelectedDataset) {
                // the user selected a column from a dataset that is not
                // among the selected datasets of this join/group-by job.
                throw new IllegalArgumentException(aColumn.toString()
                        + " is not within the selected datasets "
                        + "of this join/group task; please select columns only from the selected datasets.");
            }

            List<Column> projectablesInADataset = null;
            if ((projectablesInADataset = datasetToColumns.get(aDataset)) == null) {
                projectablesInADataset = new LinkedList<Column>();
                datasetToColumns.put(aDataset, projectablesInADataset);
            }

            if (!projectablesInADataset.contains(aColumn))
                projectablesInADataset.add(aColumn);
        }
    }

    if (datasetToColumns.keySet().size() != this.datasets.length) {
        throw new IllegalArgumentException(
                "Please select at least one column from each dataset in the join/group-by job.");
    }

    // SETUP JOB
    if (this.userDefinedConf != null) {
        this.jobConf = new JobConf(Util.merge(this.jobConf, this.userDefinedConf));
    }
    this.jobConf.setJarByClass(job.getClass());
    this.jobConf.setMapOutputKeyClass(DataJoinKey.class);
    this.jobConf.setMapOutputValueClass(DataJoinValue.class);
    this.jobConf.setPartitionerClass(DataJoinKeyPartitioner.class);
    this.jobConf.setOutputValueGroupingComparator(DataJoinKey.Comparator.class);
    this.jobConf.setOutputKeyComparatorClass(DataJoinKey.class);
    this.jobConf.setReducerClass(DefaultMobiusReducer.class);
    this.jobConf.set(ConfigureConstants.PROJECTION_COLUMNS, SerializableUtil.serializeToBase64(projections));

    JobSetup.setupOutputs(this.jobConf, output, outputFormat);

    // setup input paths, projection columns for each datasets.
    for (byte assignedDatasetID = 0; assignedDatasetID < this.datasets.length; assignedDatasetID++) {
        Dataset aDataset = this.datasets[assignedDatasetID];

        // setup input for each dataset
        JobSetup.setupInputs(jobConf, aDataset, assignedDatasetID);

        // setup projection for each dataset
        JobSetup.setupProjections(jobConf, aDataset, assignedDatasetID,
                datasetToColumns.get(aDataset).toArray(new Column[0]));
    }

    // setup all dataset IDs
    for (int i = 0; i < this.datasets.length; i++) {
        Byte id = this.datasets[i].getID();
        if (!this.jobConf.get(ConfigureConstants.ALL_DATASET_IDS, "").isEmpty()) {
            this.jobConf.set(ConfigureConstants.ALL_DATASET_IDS,
                    this.jobConf.get(ConfigureConstants.ALL_DATASET_IDS) + "," + id);
        } else {
            this.jobConf.set(ConfigureConstants.ALL_DATASET_IDS, id.toString());
        }
    }

    boolean isCombinable = true;
    for (Projectable aFunc : projections) {
        aFunc.setConf(jobConf);

        if (!aFunc.isCombinable()) {
            isCombinable = false;
            LOGGER.info(aFunc.toString() + " is not combinable, #isCombinable() return false.");
            break;
        }
        if (aFunc instanceof GroupFunction && aFunc.useGroupKeyOnly()) {
            LOGGER.info(aFunc.toString()
                    + " is a group function and use group key as its input only, disable combiner.");
            isCombinable = false;
            break;
        }
    }

    LOGGER.info("Using Combiner? " + isCombinable);
    if (isCombinable) {
        jobConf.setCombinerClass(DefaultMobiusCombiner.class);
    }

    job.addToExecQueue(jobConf);

    AbstractDatasetBuilder builder = DatasetBuildersFactory.getInstance(job).getBuilder(outputFormat,
            "Dataset_" + output.getName());

    // form the output column from the projections
    List<String> outputColumns = new ArrayList<String>();
    for (Projectable func : projections) {
        String[] aProjectOutputs = func.getOutputSchema();
        for (String anOutputName : aProjectOutputs) {
            outputColumns.add(anOutputName);
        }
    }

    return builder.buildFromPreviousJob(jobConf, outputFormat, outputColumns.toArray(new String[0]));
}

From source file:com.ebay.erl.mobius.core.SortPersistable.java

License:Apache License

/**
 * Save the sort result to the given <code>output</code> with
 * the specified <code>outputFormat</code>.
 * <p>
 * 
 * The returned {@link Dataset} represents the sorted result;
 * it can be used for further analysis.
 */
public Dataset save(MobiusJob job, Path output, Class<? extends FileOutputFormat> outputFormat)
        throws IOException {
    // SETUP JOB
    if (this.userDefinedConf != null) {
        this.jobConf = new JobConf(Util.merge(this.jobConf, this.userDefinedConf));
    }
    this.jobConf.setJarByClass(job.getClass());
    this.jobConf.setMapOutputKeyClass(DataJoinKey.class);
    this.jobConf.setMapOutputValueClass(DataJoinValue.class);
    this.jobConf.setPartitionerClass(TotalOrderPartitioner.class);
    this.jobConf.setOutputKeyComparatorClass(DataJoinKey.class);
    this.jobConf.setReducerClass(TotalSortReducer.class);

    JobSetup.setupOutputs(this.jobConf, output, outputFormat);

    job.addToExecQueue(this.jobConf);

    AbstractDatasetBuilder builder = DatasetBuildersFactory.getInstance(job).getBuilder(outputFormat,
            "Dataset_" + output.getName());

    // form the output column from the projections
    List<String> outputColumns = new ArrayList<String>();
    for (Projectable func : projections) {
        String[] aProjectOutputs = func.getOutputSchema();
        for (String anOutputName : aProjectOutputs) {
            outputColumns.add(anOutputName);
        }
    }

    return builder.buildFromPreviousJob(jobConf, outputFormat, outputColumns.toArray(new String[0]));
}