List of usage examples for the org.apache.hadoop.mapreduce.Job constructor
Job(Configuration conf) throws IOException
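Before the individual examples, a minimal sketch of how this constructor is typically used. Note that on Hadoop 2.x and later the constructor is deprecated in favor of the static factory Job.getInstance(Configuration); the job name and property below are illustrative placeholders, not taken from any example in this list.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class JobConstructionSketch {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();

        // Classic (now deprecated) construction, as used by the examples below.
        @SuppressWarnings("deprecation")
        Job job = new Job(conf, "example-job");

        // Preferred on Hadoop 2.x and later: the static factory method.
        Job job2 = Job.getInstance(conf, "example-job");

        // Either way, the Job keeps its own copy of the Configuration, so
        // later changes must go through job.getConfiguration().
        job.getConfiguration().set("mapreduce.job.queuename", "default");
    }
}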
From source file:com.blackberry.logdriver.util.Search.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf(); // Configuration processed by ToolRunner

    // If run by Oozie, then load the Oozie conf too
    if (System.getProperty("oozie.action.conf.xml") != null) {
        conf.addResource(new URL("file://" + System.getProperty("oozie.action.conf.xml")));
    }

    FileSystem fs = FileSystem.get(conf);

    // The command line options
    String searchString = null;
    List<Path> paths = new ArrayList<Path>();
    Path outputDir = null;

    // Load input files from the command line
    if (args.length < 3) {
        System.out.println("usage: [genericOptions] searchString input [input ...] output");
        System.exit(1);
    }

    // Get the files we need from the command line.
    searchString = args[0];
    for (int i = 1; i < args.length - 1; i++) {
        for (FileStatus f : fs.globStatus(new Path(args[i]))) {
            paths.add(f.getPath());
        }
    }
    outputDir = new Path(args[args.length - 1]);

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    Configuration jobConf = job.getConfiguration();

    job.setJarByClass(Search.class);
    jobConf.setIfUnset("mapred.job.name", "Search Files");

    // To propagate credentials within Oozie
    if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
        jobConf.set("mapreduce.job.credentials.binary", System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
    }

    // Good output separators include things that are unsupported by XML. So we
    // just send the byte value of the character through. The restriction here
    // is that it can't be more than 1 byte when UTF-8 encoded, since it will be
    // read by Pig which only deals with single byte separators.
    {
        String outputSeparator = jobConf.get("logdriver.output.field.separator", DEFAULT_OUTPUT_SEPARATOR);
        byte[] bytes = outputSeparator.getBytes(UTF_8);
        if (bytes.length != 1) {
            LOG.error("The output separator must be a single byte in UTF-8.");
            return 1;
        }
        jobConf.set("logdriver.output.field.separator", Byte.toString(bytes[0]));
    }

    jobConf.set("logdriver.search.string", searchString);

    job.setInputFormatClass(BoomInputFormat.class);
    job.setMapperClass(SearchMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(NullWritable.class);

    job.setNumReduceTasks(0);

    // And set the output as usual
    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, outputDir);
    for (Path path : paths) {
        BoomInputFormat.addInputPath(job, path);
    }

    // Run the job.
    if (conf.getBoolean("job.wait", DEFAULT_WAIT_JOB)) {
        return job.waitForCompletion(true) ? 0 : 1;
    } else {
        job.submit();
        return 0;
    }
}
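Since run() obtains its Configuration from getConf(), the class is presumably a Tool driven through ToolRunner. A minimal, assumed main() for it might look like this (only the Search class name is taken from the example; the rest is a sketch):

public static void main(String[] args) throws Exception {
    // ToolRunner parses the generic options (-D, -conf, -libjars, ...) before
    // handing the remaining arguments to run(String[]).
    int exitCode = org.apache.hadoop.util.ToolRunner.run(
            new org.apache.hadoop.conf.Configuration(), new Search(), args);
    System.exit(exitCode);
}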
From source file:com.boozallen.cognition.accumulo.config.AccumuloConfiguration.java
License:Apache License
public AccumuloConfiguration(Instance instance, String accumuloUser, String accumuloPassword, boolean isMock)
        throws AccumuloSecurityException, IOException {
    // NOTE: new Job(new Configuration()) does not work in the Scala shell due to the
    // toString method's implementation; to get it to work in Scala, override toString.

    // Initialize fields; these are needed for lazy initialization of the connector.
    this.zkInstance = instance;
    this.accumuloUser = accumuloUser;
    this.accumuloPassword = accumuloPassword;

    this.job = new Job(new Configuration());
    AbstractInputFormat.setConnectorInfo(job, accumuloUser, new PasswordToken(accumuloPassword));
    AccumuloOutputFormat.setConnectorInfo(job, accumuloUser, new PasswordToken(accumuloPassword));
    AbstractInputFormat.setScanAuthorizations(job, new Authorizations());
    if (isMock) {
        AbstractInputFormat.setMockInstance(job, instance.getInstanceName());
        AccumuloOutputFormat.setMockInstance(job, instance.getInstanceName());
    } else {
        this.clientConfig = new ClientConfiguration();
        this.clientConfig.withInstance(instance.getInstanceName());
        this.clientConfig.withZkHosts(instance.getZooKeepers());
        AbstractInputFormat.setZooKeeperInstance(job, clientConfig);
        AccumuloOutputFormat.setZooKeeperInstance(job, this.clientConfig);
    }
}
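A hypothetical call site for this constructor, using the standard Accumulo client classes; the instance name, ZooKeeper hosts, and credentials below are placeholders, not values from the source project:

// Connect to a real (non-mock) Accumulo instance backed by ZooKeeper.
Instance instance = new ZooKeeperInstance("accumulo-instance", "zk1:2181,zk2:2181");
AccumuloConfiguration accumuloConf =
        new AccumuloConfiguration(instance, "accumulo-user", "accumulo-password", false);

// For unit tests, a mock instance avoids the need for a running cluster.
AccumuloConfiguration mockConf =
        new AccumuloConfiguration(new MockInstance("test"), "root", "", true);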
From source file:com.cloudera.avro.MapReduceAvroWordCount.java
License:Apache License
public int run(String[] args) throws Exception { if (args.length != 2) { System.err.println("Usage: AvroWordCount <input path> <output path>"); return -1; }/* w ww . j a v a 2 s . co m*/ Job job = new Job(getConf()); job.setJarByClass(MapReduceAvroWordCount.class); job.setJobName("wordcount"); // We call setOutputSchema first so we can override the configuration // parameters it sets AvroJob.setOutputKeySchema(job, Pair.getPairSchema(Schema.create(Type.STRING), Schema.create(Type.INT))); job.setOutputValueClass(NullWritable.class); job.setMapperClass(Map.class); job.setReducerClass(Reduce.class); job.setInputFormatClass(TextInputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setSortComparatorClass(Text.Comparator.class); FileInputFormat.setInputPaths(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.waitForCompletion(true); return 0; }
From source file:com.cloudera.avro.MapReduceColorCount.java
License:Apache License
public int run(String[] args) throws Exception { if (args.length != 2) { System.err.println("Usage: MapReduceColorCount <input path> <output path>"); return -1; }/*from w w w .j a v a2 s . co m*/ Job job = new Job(getConf()); job.setJarByClass(MapReduceColorCount.class); job.setJobName("Color Count"); FileInputFormat.setInputPaths(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setInputFormatClass(AvroKeyInputFormat.class); job.setMapperClass(ColorCountMapper.class); AvroJob.setInputKeySchema(job, User.getClassSchema()); AvroJob.setMapOutputValueSchema(job, User.getClassSchema()); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setOutputFormatClass(AvroKeyValueOutputFormat.class); job.setReducerClass(ColorCountReducer.class); AvroJob.setOutputKeySchema(job, Schema.create(Schema.Type.STRING)); AvroJob.setOutputValueSchema(job, Schema.create(Schema.Type.INT)); return (job.waitForCompletion(true) ? 0 : 1); }
From source file:com.cloudera.crunch.impl.mr.plan.JobPrototype.java
License:Open Source License
private CrunchJob build(Class<?> jarClass, Configuration conf) throws IOException {
    Job job = new Job(conf);
    conf = job.getConfiguration();
    job.setJarByClass(jarClass);

    Set<DoNode> outputNodes = Sets.newHashSet();
    Set<Target> targets = targetsToNodePaths.keySet();
    MSCROutputHandler outputHandler = new MSCROutputHandler(job, workingPath, group == null);
    for (Target target : targets) {
        DoNode node = null;
        for (NodePath nodePath : targetsToNodePaths.get(target)) {
            if (node == null) {
                PCollectionImpl collect = nodePath.tail();
                node = DoNode.createOutputNode(target.toString(), collect.getPType());
                outputHandler.configureNode(node, target);
            }
            outputNodes.add(walkPath(nodePath.descendingIterator(), node));
        }
    }

    job.setMapperClass(CrunchMapper.class);
    List<DoNode> inputNodes;
    DoNode reduceNode = null;
    RTNodeSerializer serializer = new RTNodeSerializer();
    if (group != null) {
        job.setReducerClass(CrunchReducer.class);
        List<DoNode> reduceNodes = Lists.newArrayList(outputNodes);
        reduceNode = reduceNodes.get(0);
        serializer.serialize(reduceNodes, conf, NodeContext.REDUCE);

        group.configureShuffle(job);

        DoNode mapOutputNode = group.getGroupingNode();
        if (reduceNodes.size() == 1 && combineFnTable != null) {
            // Handle the combiner case
            DoNode mapSideCombineNode = combineFnTable.createDoNode();
            mapSideCombineNode.addChild(mapOutputNode);
            mapOutputNode = mapSideCombineNode;
        }

        Set<DoNode> mapNodes = Sets.newHashSet();
        for (NodePath nodePath : mapNodePaths) {
            // Advance these one step, since we've already configured
            // the grouping node, and the PGroupedTableImpl is the tail
            // of the NodePath.
            Iterator<PCollectionImpl> iter = nodePath.descendingIterator();
            iter.next();
            mapNodes.add(walkPath(iter, mapOutputNode));
        }
        inputNodes = Lists.newArrayList(mapNodes);
        serializer.serialize(inputNodes, conf, NodeContext.MAP);
    } else { // No grouping
        job.setNumReduceTasks(0);
        inputNodes = Lists.newArrayList(outputNodes);
        serializer.serialize(inputNodes, conf, NodeContext.MAP);
    }

    if (inputNodes.size() == 1) {
        DoNode inputNode = inputNodes.get(0);
        inputNode.getSource().configureSource(job, -1);
    } else {
        for (int i = 0; i < inputNodes.size(); i++) {
            DoNode inputNode = inputNodes.get(i);
            inputNode.getSource().configureSource(job, i);
        }
        job.setInputFormatClass(CrunchInputFormat.class);
    }
    job.setJobName(createJobName(inputNodes, reduceNode));

    return new CrunchJob(job, workingPath, outputHandler);
}
From source file:com.cloudera.crunch.impl.mr.run.CrunchInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException {
    List<InputSplit> splits = Lists.newArrayList();
    Configuration conf = job.getConfiguration();
    Job jobCopy = new Job(conf);
    Map<Class<? extends InputFormat>, Map<Integer, List<Path>>> formatNodeMap = CrunchInputs
            .getFormatNodeMap(jobCopy);

    // First, build a map of InputFormats to Paths
    for (Map.Entry<Class<? extends InputFormat>, Map<Integer, List<Path>>> entry : formatNodeMap.entrySet()) {
        Class<? extends InputFormat> formatClass = entry.getKey();
        InputFormat format = (InputFormat) ReflectionUtils.newInstance(formatClass, conf);
        for (Map.Entry<Integer, List<Path>> nodeEntry : entry.getValue().entrySet()) {
            Integer nodeIndex = nodeEntry.getKey();
            List<Path> paths = nodeEntry.getValue();
            FileInputFormat.setInputPaths(jobCopy, paths.toArray(new Path[paths.size()]));

            // Get splits for each input path and tag with InputFormat
            // and Mapper types by wrapping in a TaggedInputSplit.
            List<InputSplit> pathSplits = format.getSplits(jobCopy);
            for (InputSplit pathSplit : pathSplits) {
                splits.add(new CrunchInputSplit(pathSplit, formatClass, nodeIndex, conf));
            }
        }
    }
    return splits;
}
From source file:com.cloudera.recordservice.pig.HCatRSLoader.java
License:Apache License
@Override
public void setLocation(String location, Job job) throws IOException {
    HCatContext.INSTANCE.setConf(job.getConfiguration()).getConf().get()
            .setBoolean(HCatConstants.HCAT_DATA_TINY_SMALL_INT_PROMOTION, true);

    UDFContext udfContext = UDFContext.getUDFContext();
    Properties udfProps = udfContext.getUDFProperties(this.getClass(), new String[] { signature });
    job.getConfiguration().set(INNER_SIGNATURE, INNER_SIGNATURE_PREFIX + "_" + signature);

    RequiredFieldList requiredFieldsInfo = (RequiredFieldList) udfProps.get(PRUNE_PROJECTION_INFO);
    // get partitionFilterString stored in the UDFContext - it would have
    // been stored there by an earlier call to setPartitionFilter
    // call setInput on HCatInputFormat only in the frontend because internally
    // it makes calls to the hcat server - we don't want these to happen in
    // the backend
    // in the hadoop front end mapred.task.id property will not be set in
    // the Configuration
    if (udfProps.containsKey(HCatConstants.HCAT_PIG_LOADER_LOCATION_SET)) {
        for (Enumeration<Object> emr = udfProps.keys(); emr.hasMoreElements();) {
            PigHCatUtil.getConfigFromUDFProperties(udfProps, job.getConfiguration(),
                    emr.nextElement().toString());
        }
        if (!HCatUtil.checkJobContextIfRunningFromBackend(job)) {
            // Combine credentials; credentials from the job take precedence for freshness
            Credentials crd = jobCredentials.get(INNER_SIGNATURE_PREFIX + "_" + signature);
            job.getCredentials().addAll(crd);
        }
    } else {
        Job clone = new Job(job.getConfiguration());
        HCatRSInputFormat.setInput(job, location, getPartitionFilterString());

        InputJobInfo inputJobInfo = (InputJobInfo) HCatRSUtil
                .deserialize(job.getConfiguration().get(HCatConstants.HCAT_KEY_JOB_INFO));

        // TODO: Add back special cases call when I find out where the code has moved.
        addSpecialCasesParametersForHCatLoader(job.getConfiguration(), inputJobInfo.getTableInfo());

        // We will store all the new/changed properties in the job in the
        // udf context, so the HCatInputFormat.setInput method need not
        // be called many times.
        for (Entry<String, String> keyValue : job.getConfiguration()) {
            String oldValue = clone.getConfiguration().getRaw(keyValue.getKey());
            if ((oldValue == null) || (keyValue.getValue().equals(oldValue) == false)) {
                udfProps.put(keyValue.getKey(), keyValue.getValue());
            }
        }
        udfProps.put(HCatConstants.HCAT_PIG_LOADER_LOCATION_SET, true);

        // Store credentials in a private hash map and not the udf context to
        // make sure they are not public.
        Credentials crd = new Credentials();
        crd.addAll(job.getCredentials());
        jobCredentials.put(INNER_SIGNATURE_PREFIX + "_" + signature, crd);
        clone.setInputFormatClass(HCatRSInputFormat.class);
    }

    // Need to also push projections by calling setOutputSchema on
    // HCatInputFormat - we have to get the RequiredFields information
    // from the UdfContext, translate it to a Schema and then pass it.
    // The reason we do this here is because setLocation() is called by
    // Pig runtime at InputFormat.getSplits() and
    // InputFormat.createRecordReader() time - we are not sure when
    // HCatInputFormat needs to know about pruned projections - so doing it
    // here will ensure we communicate to HCatInputFormat about pruned
    // projections at getSplits() and createRecordReader() time
    if (requiredFieldsInfo != null) {
        // convert to hcatschema and pass to HCatInputFormat
        try {
            outputSchema = phutil.getHCatSchema(requiredFieldsInfo.getFields(), signature, this.getClass());
            HCatRSInputFormat.setOutputSchema(job, outputSchema);
        } catch (Exception e) {
            throw new IOException(e);
        }
    } else {
        // else - this means pig's optimizer never invoked the pushProjection
        // method - so we need all fields and hence we should not call the
        // setOutputSchema on HCatInputFormat
        if (HCatUtil.checkJobContextIfRunningFromBackend(job)) {
            try {
                HCatSchema hcatTableSchema = (HCatSchema) udfProps.get(HCatConstants.HCAT_TABLE_SCHEMA);
                outputSchema = hcatTableSchema;
                HCatRSInputFormat.setOutputSchema(job, outputSchema);
            } catch (Exception e) {
                throw new IOException(e);
            }
        }
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("outputSchema=" + outputSchema);
    }
    job.setInputFormatClass(HCatRSInputFormat.class);
}
From source file:com.cloudera.spark.bulkload.TotalOrderPartitioner.java
License:Apache License
/**
 * Read in the partition file and build indexing data structures.
 * If the keytype is {@link BinaryComparable} and
 * <tt>total.order.partitioner.natural.order</tt> is not false, a trie
 * of the first <tt>total.order.partitioner.max.trie.depth</tt>(2) + 1 bytes
 * will be built. Otherwise, keys will be located using a binary search of
 * the partition keyset using the {@link RawComparator}
 * defined for this job. The input file must be sorted with the same
 * comparator and contain {@link Job#getNumReduceTasks()} - 1 keys.
 */
@SuppressWarnings("unchecked") // keytype from conf not static
public void setConf(Configuration conf) {
    try {
        this.conf = conf;
        String parts = getPartitionFile(conf);
        final Path partFile = new Path(parts);
        final FileSystem fs = (DEFAULT_PATH.equals(parts)) ? FileSystem.getLocal(conf) // assume in DistributedCache
                : partFile.getFileSystem(conf);

        Job job = new Job(conf);
        Class<K> keyClass = (Class<K>) job.getMapOutputKeyClass();
        K[] splitPoints = readPartitions(fs, partFile, keyClass, conf);
        if (splitPoints.length != job.getNumReduceTasks() - 1) {
            throw new IOException("Wrong number of partitions in keyset");
        }
        RawComparator<K> comparator = (RawComparator<K>) job.getSortComparator();
        for (int i = 0; i < splitPoints.length - 1; ++i) {
            if (comparator.compare(splitPoints[i], splitPoints[i + 1]) >= 0) {
                throw new IOException("Split points are out of order");
            }
        }
        boolean natOrder = conf.getBoolean(NATURAL_ORDER, true);
        if (natOrder && BinaryComparable.class.isAssignableFrom(keyClass)) {
            partitions = buildTrie((BinaryComparable[]) splitPoints, 0, splitPoints.length, new byte[0],
                    // Now that blocks of identical splitless trie nodes are
                    // represented reentrantly, and we develop a leaf for any trie
                    // node with only one split point, the only reason for a depth
                    // limit is to refute stack overflow or bloat in the pathological
                    // case where the split points are long and mostly look like bytes
                    // iii...iixii...iii . Therefore, we make the default depth
                    // limit large but not huge.
                    conf.getInt(MAX_TRIE_DEPTH, 200));
        } else {
            partitions = new BinarySearchNode(splitPoints, comparator);
        }
    } catch (IOException e) {
        throw new IllegalArgumentException("Can't read partitions file", e);
    }
}
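For context, the partition file read by setConf() is normally produced ahead of time by sampling the job's input. A sketch of the standard Hadoop pattern follows, using the stock org.apache.hadoop.mapreduce.lib.partition classes (the bulk-load copy above appears to mirror them); paths, reducer count, and sampler parameters are illustrative, and the job's input format and paths are assumed to be configured already:

Job job = Job.getInstance(conf, "total-order-sort");
job.setNumReduceTasks(4);
job.setPartitionerClass(TotalOrderPartitioner.class);

// Place the partition file somewhere both the sampler and the tasks can read it.
Path partitionFile = new Path("/tmp/partitions.lst");
TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);

// Sample up to 10,000 keys from the configured input to pick
// numReduceTasks - 1 split points, sorted with the job's comparator.
InputSampler.writePartitionFile(job,
        new InputSampler.RandomSampler<Text, Text>(0.1, 10000, 10));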
From source file:com.cloudera.sqoop.mapreduce.db.TestDataDrivenDBInputFormat.java
License:Apache License
public void testDateSplits() throws Exception {
    Statement s = connection.createStatement();
    final String DATE_TABLE = "datetable";
    final String COL = "foo";
    try {
        try {
            // delete the table if it already exists.
            s.executeUpdate("DROP TABLE " + DATE_TABLE);
        } catch (SQLException e) {
            // Ignored; proceed regardless of whether we deleted the table;
            // it may have simply not existed.
        }

        // Create the table.
        s.executeUpdate("CREATE TABLE " + DATE_TABLE + "(" + COL + " TIMESTAMP)");
        s.executeUpdate("INSERT INTO " + DATE_TABLE + " VALUES('2010-04-01')");
        s.executeUpdate("INSERT INTO " + DATE_TABLE + " VALUES('2010-04-02')");
        s.executeUpdate("INSERT INTO " + DATE_TABLE + " VALUES('2010-05-01')");
        s.executeUpdate("INSERT INTO " + DATE_TABLE + " VALUES('2011-04-01')");

        // commit this tx.
        connection.commit();

        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");
        FileSystem fs = FileSystem.getLocal(conf);
        fs.delete(new Path(OUT_DIR), true);

        // now do a dd import
        Job job = new Job(conf);
        job.setMapperClass(ValMapper.class);
        job.setReducerClass(Reducer.class);
        job.setMapOutputKeyClass(DateCol.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(DateCol.class);
        job.setOutputValueClass(NullWritable.class);
        job.setNumReduceTasks(1);
        job.getConfiguration().setInt("mapreduce.map.tasks", 2);
        FileOutputFormat.setOutputPath(job, new Path(OUT_DIR));
        DBConfiguration.configureDB(job.getConfiguration(), DRIVER_CLASS, DB_URL, (String) null, (String) null);
        DataDrivenDBInputFormat.setInput(job, DateCol.class, DATE_TABLE, null, COL, COL);

        boolean ret = job.waitForCompletion(true);
        assertTrue("job failed", ret);

        // Check to see that we imported as much as we thought we did.
        assertEquals("Did not get all the records", 4, job.getCounters()
                .findCounter("org.apache.hadoop.mapred.Task$Counter", "REDUCE_OUTPUT_RECORDS").getValue());
    } finally {
        s.close();
    }
}
From source file:com.cloudera.sqoop.mapreduce.MergeJob.java
License:Apache License
public boolean runMergeJob() throws IOException {
    Configuration conf = options.getConf();
    Job job = new Job(conf);

    String userClassName = options.getClassName();
    if (null == userClassName) {
        // Shouldn't get here.
        throw new IOException("Record class name not specified with --class-name.");
    }

    // Set the external jar to use for the job.
    String existingJar = options.getExistingJarName();
    if (existingJar != null) {
        // User explicitly identified a jar path.
        LOG.debug("Setting job jar to user-specified jar: " + existingJar);
        job.getConfiguration().set("mapred.jar", existingJar);
    } else {
        // Infer it from the location of the specified class, if it's on the
        // classpath.
        try {
            Class<? extends Object> userClass = conf.getClassByName(userClassName);
            if (null != userClass) {
                String userJar = Jars.getJarPathForClass(userClass);
                LOG.debug("Setting job jar based on user class " + userClassName + ": " + userJar);
                job.getConfiguration().set("mapred.jar", userJar);
            } else {
                LOG.warn("Specified class " + userClassName + " is not in a jar. "
                        + "MapReduce may not find the class");
            }
        } catch (ClassNotFoundException cnfe) {
            throw new IOException(cnfe);
        }
    }

    try {
        Path oldPath = new Path(options.getMergeOldPath());
        Path newPath = new Path(options.getMergeNewPath());

        Configuration jobConf = job.getConfiguration();
        FileSystem fs = FileSystem.get(jobConf);
        oldPath = oldPath.makeQualified(fs);
        newPath = newPath.makeQualified(fs);

        FileInputFormat.addInputPath(job, oldPath);
        FileInputFormat.addInputPath(job, newPath);

        jobConf.set(MERGE_OLD_PATH_KEY, oldPath.toString());
        jobConf.set(MERGE_NEW_PATH_KEY, newPath.toString());
        jobConf.set(MERGE_KEY_COL_KEY, options.getMergeKeyCol());
        jobConf.set(MERGE_SQOOP_RECORD_KEY, userClassName);

        FileOutputFormat.setOutputPath(job, new Path(options.getTargetDir()));

        if (ExportJobBase.isSequenceFiles(jobConf, newPath)) {
            job.setInputFormatClass(SequenceFileInputFormat.class);
            job.setOutputFormatClass(SequenceFileOutputFormat.class);
            job.setMapperClass(MergeRecordMapper.class);
        } else {
            job.setMapperClass(MergeTextMapper.class);
            job.setOutputFormatClass(RawKeyTextOutputFormat.class);
        }

        jobConf.set("mapred.output.key.class", userClassName);
        job.setOutputValueClass(NullWritable.class);

        job.setReducerClass(MergeReducer.class);

        // Set the intermediate data types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(MergeRecord.class);

        // Make sure Sqoop and anything else we need is on the classpath.
        cacheJars(job, null);

        return this.runJob(job);
    } catch (InterruptedException ie) {
        throw new IOException(ie);
    } catch (ClassNotFoundException cnfe) {
        throw new IOException(cnfe);
    }
}