List of usage examples for org.apache.hadoop.mapreduce JobContext getConfiguration
public Configuration getConfiguration();
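Before the collected examples, a minimal sketch of the common pattern: getConfiguration() returns the job's Configuration, so InputFormats, Mappers, and Reducers can read per-job settings at runtime. The class name and the example.input.allow.splits property below are illustrative only and do not come from any of the projects listed further down.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

// Hypothetical InputFormat that reads a custom flag from the job configuration.
public class ExampleInputFormat extends TextInputFormat {
    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        // JobContext.getConfiguration() exposes the job's Configuration,
        // so per-job settings can steer InputFormat behaviour.
        Configuration conf = context.getConfiguration();
        boolean allowSplits = conf.getBoolean("example.input.allow.splits", true);
        return allowSplits && super.isSplitable(context, file);
    }
}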
From source file:net.thevis.groovyhadoop.backport.CombineFileInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSizeNode = 0;
    long minSizeRack = 0;
    long maxSize = 0;
    Configuration conf = job.getConfiguration();

    // the values specified by setxxxSplitSize() takes precedence over the
    // values that might have been specified in the config
    if (minSplitSizeNode != 0) {
        minSizeNode = minSplitSizeNode;
    } else {
        minSizeNode = conf.getLong(SPLIT_MINSIZE_PERNODE, 0);
    }
    if (minSplitSizeRack != 0) {
        minSizeRack = minSplitSizeRack;
    } else {
        minSizeRack = conf.getLong(SPLIT_MINSIZE_PERRACK, 0);
    }
    if (maxSplitSize != 0) {
        maxSize = maxSplitSize;
    } else {
        maxSize = conf.getLong("mapreduce.input.fileinputformat.split.maxsize", 0);
    }
    if (minSizeNode != 0 && maxSize != 0 && minSizeNode > maxSize) {
        throw new IOException("Minimum split size pernode " + minSizeNode
                + " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && maxSize != 0 && minSizeRack > maxSize) {
        throw new IOException("Minimum split size per rack" + minSizeRack
                + " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && minSizeNode > minSizeRack) {
        throw new IOException("Minimum split size per node" + minSizeNode
                + " cannot be smaller than minimum split " + "size per rack " + minSizeRack);
    }

    // all the files in input set
    Path[] paths = FileUtil.stat2Paths(listStatus(job).toArray(new FileStatus[0]));
    List<InputSplit> splits = new ArrayList<InputSplit>();
    if (paths.length == 0) {
        return splits;
    }

    // Convert them to Paths first. This is a costly operation and
    // we should do it first, otherwise we will incur doing it multiple
    // times, one time each for each pool in the next loop.
    List<Path> newpaths = new LinkedList<Path>();
    for (int i = 0; i < paths.length; i++) {
        Path p = new Path(paths[i].toUri().getPath());
        newpaths.add(p);
    }
    paths = null;

    // In one single iteration, process all the paths in a single pool.
    // Processing one pool at a time ensures that a split contains paths
    // from a single pool only.
    for (MultiPathFilter onepool : pools) {
        ArrayList<Path> myPaths = new ArrayList<Path>();

        // pick one input path. If it matches all the filters in a pool,
        // add it to the output set
        for (Iterator<Path> iter = newpaths.iterator(); iter.hasNext();) {
            Path p = iter.next();
            if (onepool.accept(p)) {
                myPaths.add(p); // add it to my output set
                iter.remove();
            }
        }
        // create splits for all files in this pool.
        getMoreSplits(conf, myPaths.toArray(new Path[myPaths.size()]), maxSize, minSizeNode, minSizeRack, splits);
    }

    // create splits for all files that are not in any pool.
    getMoreSplits(conf, newpaths.toArray(new Path[newpaths.size()]), maxSize, minSizeNode, minSizeRack, splits);

    // free up rackToNodes map
    rackToNodes.clear();
    return splits;
}
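The getSplits() method above falls back to configuration keys when the setXxxSplitSize() setters are not used. A hedged driver-side sketch of how those values might be supplied: the "mapreduce.input.fileinputformat.split.maxsize" key appears literally in the example, while the per-node and per-rack key names follow mainline Hadoop 2.x and may differ in the backport; the driver class name is illustrative.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class CombineSplitSizeDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Values read by the getSplits() fallback when the setters are not used.
        conf.setLong("mapreduce.input.fileinputformat.split.maxsize", 256L * 1024 * 1024);
        conf.setLong("mapreduce.input.fileinputformat.split.minsize.per.node", 64L * 1024 * 1024);
        conf.setLong("mapreduce.input.fileinputformat.split.minsize.per.rack", 128L * 1024 * 1024);
        Job job = Job.getInstance(conf, "combine-split-size-example");
        // ... set input/output paths, mapper, reducer, then submit.
    }
}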
From source file:nl.basjes.hadoop.input.ApacheHttpdLogfileInputFormat.java
License:Apache License
@Override
protected boolean isSplitable(JobContext context, Path file) {
    final CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
    return (null == codec) || codec instanceof SplittableCompressionCodec;
}
From source file:nl.gridline.zieook.runners.cf.RecommenderJobZieOok.java
License:Apache License
private static void setIOSort(JobContext job) {
    Configuration conf = job.getConfiguration();
    conf.setInt("io.sort.factor", 100);

    int assumedHeapSize = 512;
    String javaOpts = conf.get("mapred.child.java.opts");
    if (javaOpts != null) {
        Matcher m = Pattern.compile("-Xmx([0-9]+)([mMgG])").matcher(javaOpts);
        if (m.find()) {
            assumedHeapSize = Integer.parseInt(m.group(1));
            String megabyteOrGigabyte = m.group(2);
            if ("g".equalsIgnoreCase(megabyteOrGigabyte)) {
                assumedHeapSize *= 1024;
            }
        }
    }
    conf.setInt("io.sort.mb", assumedHeapSize / 2);

    // For some reason the Merger doesn't report status for a long time; increase
    // timeout when running these jobs
    conf.setInt("mapred.task.timeout", 60 * 60 * 1000);
}
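The helper above derives io.sort.mb from the -Xmx flag in the child JVM options and defaults to a 512 MB heap when no flag is present. The standalone sketch below (class and method names are illustrative) isolates that heuristic so its behaviour can be checked against a few sample values.

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class HeapParseSketch {
    // Same heuristic as setIOSort(): parse -Xmx, convert to MB, take half.
    static int ioSortMbFor(String javaOpts) {
        int assumedHeapSize = 512; // default when no -Xmx flag is found
        if (javaOpts != null) {
            Matcher m = Pattern.compile("-Xmx([0-9]+)([mMgG])").matcher(javaOpts);
            if (m.find()) {
                assumedHeapSize = Integer.parseInt(m.group(1));
                if ("g".equalsIgnoreCase(m.group(2))) {
                    assumedHeapSize *= 1024; // gigabytes to megabytes
                }
            }
        }
        return assumedHeapSize / 2;
    }

    public static void main(String[] args) {
        System.out.println(ioSortMbFor("-Xmx2g"));   // 1024
        System.out.println(ioSortMbFor("-Xmx512m")); // 256
        System.out.println(ioSortMbFor(null));       // 256 (falls back to the 512 MB assumption)
    }
}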
From source file:nyu.cs.webgraph.MRhelpers.LzoTabSeperatedTextInputFormat.java
License:Open Source License
@Override
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    List<FileStatus> files = super.listStatus(job);

    String fileExtension = new LzopCodec().getDefaultExtension();
    Configuration conf = job.getConfiguration();

    for (Iterator<FileStatus> iterator = files.iterator(); iterator.hasNext();) {
        FileStatus fileStatus = iterator.next();
        Path file = fileStatus.getPath();
        FileSystem fs = file.getFileSystem(conf);

        if (!file.toString().endsWith(fileExtension)) {
            // get rid of non lzo files
            iterator.remove();
        } else {
            // read the index file
            LzoIndex index = LzoIndex.readIndex(fs, file);
            indexes.put(file, index);
        }
    }
    return files;
}
From source file:nyu.cs.webgraph.MRhelpers.LzoTabSeperatedTextInputFormat.java
License:Open Source License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> splits = super.getSplits(job);
    Configuration conf = job.getConfiguration();

    // find new start/ends of the filesplit that aligns
    // with the lzo blocks
    List<InputSplit> result = new ArrayList<InputSplit>();

    for (InputSplit genericSplit : splits) {
        // load the index
        FileSplit fileSplit = (FileSplit) genericSplit;
        Path file = fileSplit.getPath();
        FileSystem fs = file.getFileSystem(conf);
        LzoIndex index = indexes.get(file);
        if (index == null) {
            throw new IOException("Index not found for " + file);
        }
        if (index.isEmpty()) {
            // empty index, keep as is
            result.add(fileSplit);
            continue;
        }

        long start = fileSplit.getStart();
        long end = start + fileSplit.getLength();

        long lzoStart = index.alignSliceStartToIndex(start, end);
        long lzoEnd = index.alignSliceEndToIndex(end, fs.getFileStatus(file).getLen());

        if (lzoStart != LzoIndex.NOT_FOUND && lzoEnd != LzoIndex.NOT_FOUND) {
            result.add(new FileSplit(file, lzoStart, lzoEnd - lzoStart, fileSplit.getLocations()));
        }
    }
    return result;
}
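Both overrides above run during job submission: listStatus() keeps only LZO files and caches their indexes, and getSplits() realigns split boundaries to LZO block boundaries. A hedged driver sketch showing how such an input format might be wired into a job; the driver class name and argument handling are illustrative.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import nyu.cs.webgraph.MRhelpers.LzoTabSeperatedTextInputFormat;

public class LzoInputDriver {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "lzo-input-example");
        job.setJarByClass(LzoInputDriver.class);
        // Use the LZO-aware input format so splits respect LZO block boundaries.
        job.setInputFormatClass(LzoTabSeperatedTextInputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        // ... mapper, reducer and output settings omitted.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}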
From source file:oracle.kv.hadoop.KVInputFormatBase.java
License:Open Source License
/**
 * @hidden
 * Logically split the set of input data for the job.
 *
 * @param context job configuration.
 *
 * @return an array of {@link InputSplit}s for the job.
 */
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    if (context != null) {
        final Configuration conf = context.getConfiguration();
        initializeParameters(conf);
    }

    if (kvStoreName == null) {
        throw new IllegalArgumentException("No KV Store Name provided. Use either the "
                + ParamConstant.KVSTORE_NAME.getName() + " parameter or call "
                + KVInputFormatBase.class.getName() + ".setKVStoreName().");
    }

    if (kvHelperHosts == null) {
        throw new IllegalArgumentException("No KV Helper Hosts were provided. Use either the "
                + ParamConstant.KVSTORE_NODES.getName() + " parameter or call "
                + KVInputFormatBase.class.getName() + ".setKVHelperHosts().");
    }

    final KVStoreLogin storeLogin = new KVStoreLogin(null, kvStoreSecurityFile);
    storeLogin.loadSecurityProperties();
    storeLogin.prepareRegistryCSF();
    LoginManager loginMgr = null;
    if (storeLogin.foundSSLTransport()) {
        loginMgr = KVStoreLogin.getRepNodeLoginMgr(kvHelperHosts, storeLogin.getLoginCredentials(), kvStoreName);
    }

    Topology topology = null;
    try {
        topology = TopologyLocator.get(kvHelperHosts, 0, loginMgr, kvStoreName);
    } catch (KVStoreException KVSE) {
        KVSE.printStackTrace();
        return null;
    }

    /* Create a set of splits based on shards and consistency */
    final SplitBuilder sb = new SplitBuilder(topology);
    final List<TopoSplit> splits = sb.createShardSplits(consistency);
    final List<InputSplit> ret = new ArrayList<InputSplit>(splits.size());
    final RegistryUtils regUtils = new RegistryUtils(topology, loginMgr);

    for (TopoSplit ts : splits) {
        if (ts.isEmpty()) {
            /* Split is empty, skip */
            continue;
        }

        final List<String> repNodeNames = new ArrayList<String>();
        final List<String> repNodeNamesAndPorts = new ArrayList<String>();

        for (StorageNode sn : ts.getSns(consistency, topology, regUtils)) {
            repNodeNames.add(sn.getHostname());
            repNodeNamesAndPorts.add(sn.getHostname() + ":" + sn.getRegistryPort());
        }

        ret.add(new KVInputSplit().setKVHelperHosts(repNodeNamesAndPorts.toArray(new String[0]))
                .setKVStoreName(kvStoreName).setKVStoreSecurityFile(storeLogin.getSecurityFilePath())
                .setLocations(repNodeNames.toArray(new String[0])).setDirection(direction)
                .setBatchSize(batchSize).setParentKey(parentKey).setSubRange(subRange).setDepth(depth)
                .setConsistency(consistency).setTimeout(timeout).setTimeoutUnit(timeoutUnit)
                .setFormatterClassName(formatterClassName).setPartitionSets(ts.getPartitionSets()));
    }

    return ret;
}
From source file:oracle.kv.hadoop.table.TableInputFormatBase.java
License:Open Source License
/**
 * @hidden
 * Logically split the set of input data for the job.
 *
 * @param context job configuration.
 *
 * @return an array of {@link InputSplit}s for the job.
 */
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    if (context != null) {
        final Configuration conf = context.getConfiguration();
        initializeParameters(conf);
    }

    if (kvStoreName == null) {
        throw new IllegalArgumentException("No KV Store Name provided. Use either the "
                + ParamConstant.KVSTORE_NAME.getName() + " parameter or call "
                + TableInputFormatBase.class.getName() + ".setKVStoreName().");
    }

    if (kvHelperHosts == null) {
        throw new IllegalArgumentException("No KV Helper Hosts were provided. Use either the "
                + ParamConstant.KVSTORE_NODES.getName() + " parameter or call "
                + TableInputFormatBase.class.getName() + ".setKVHelperHosts().");
    }

    if (kvHadoopHosts == null) {
        kvHadoopHosts = new String[kvHelperHosts.length];
        for (int i = 0; i < kvHelperHosts.length; i++) {
            /* Strip off the ':port' suffix */
            final String[] hostPort = (kvHelperHosts[i]).trim().split(":");
            kvHadoopHosts[i] = hostPort[0];
        }
    }

    if (tableName == null) {
        throw new IllegalArgumentException("No Table Name provided. Use either the "
                + ParamConstant.TABLE_NAME.getName() + " parameter or call "
                + TableInputFormatBase.class.getName() + ".setTableName().");
    }

    final String userName = (passwordCredentials == null ? null : passwordCredentials.getUsername());
    final KVStoreLogin storeLogin = new KVStoreLogin(userName, localLoginFile);
    storeLogin.loadSecurityProperties();
    storeLogin.prepareRegistryCSF();
    LoginManager loginMgr = null;
    if (storeLogin.foundSSLTransport()) {
        loginMgr = KVStoreLogin.getRepNodeLoginMgr(kvHelperHosts, passwordCredentials, kvStoreName);
    }

    /*
     * Retrieve the topology of the store.
     *
     * Note that if the same Hive CLI session is used to run queries that
     * must connect to different KVStores where one store is non-secure
     * and the other is secure, then if the most recent call to this method
     * invoked the code below to retrieve the topology from the secure
     * store, then the security information is stored in the system
     * properties and the state of the splits, and the client socket
     * factory used when communicating with the RMI registry while
     * retrieving the topology is configured for SSL communication. As
     * a result, if the current call to this method invokes the code below
     * to retrieve the topology of the non-secure store, and if the client
     * socket factory is not reconfigured for non-SSL communication, then
     * a KVServerException (wrapping a java.rmi.ConnectIOException) will
     * be encountered. To address this, KVStoreException is caught, the
     * client socket factory is reconfigured for non-SSL communication,
     * and the attempt to retrieve the topology is retried with no
     * security information.
     *
     * If both secure and non-secure attempts fail, then the stack trace
     * is sent to both the DataNode's stderr log file and the Hive CLI
     * display screen.
     */
    Topology topology;
    try {
        topology = TopologyLocator.get(kvHelperHosts, 0, loginMgr, kvStoreName);
    } catch (KVStoreException e) {
        if (passwordCredentials != null) {
            /* Retry with no security */
            LOG.debug("Failure on topology retrieval: attempt to "
                    + "communicate with RMI registry over SSL unsuccessful. "
                    + "Changing from SSLClientSocketFactory to "
                    + "ClientSocketFactory and retrying ...");
            ClientSocketFactory.setRMIPolicy(null, kvStoreName);
            RegistryUtils.initRegistryCSF();
            try {
                topology = TopologyLocator.get(kvHelperHosts, 0, null, kvStoreName);
            } catch (KVStoreException e1) {
                e1.printStackTrace(); /* Send to DataNode's stderr file. */
                throw new IOException(e1); /* Send to Hive CLI. */
            }
        } else {
            e.printStackTrace(); /* Send to DataNode's stderr file. */
            throw new IOException(e); /* Send to Hive CLI. */
        }
    }

    /* Create splits based on the store's partitions or its shards. */
    final List<TopoSplitWrapper> splits = getSplitInfo(topology, consistency, queryBy, shardKeyPartitionId);
    final List<InputSplit> ret = new ArrayList<InputSplit>(splits.size());

    for (TopoSplitWrapper ts : splits) {
        final TableInputSplit split = new TableInputSplit();
        split.setKVStoreName(kvStoreName);
        split.setKVHelperHosts(kvHelperHosts);
        split.setLocations(kvHadoopHosts);
        split.setTableName(tableName);
        split.setKVStoreSecurity(loginFlnm, passwordCredentials, trustFlnm);
        split.setPrimaryKeyProperty(primaryKeyProperty);

        /* For MultiRowOptions */
        split.setFieldRangeProperty(fieldRangeProperty);

        /* For TableIteratorOptions */
        split.setDirection(direction);
        split.setConsistency(consistency);
        split.setTimeout(timeout);
        split.setTimeoutUnit(timeoutUnit);
        split.setMaxRequests(maxRequests);
        split.setBatchSize(batchSize);
        split.setMaxBatches(maxBatches);

        split.setPartitionSets(ts.getPartitionSets());
        split.setQueryInfo(queryBy, whereClause);
        split.setShardSet(ts.getShardSet());

        ret.add(split);
    }
    return ret;
}
From source file:org.apache.accumulo.core.client.mapreduce.AbstractInputFormat.java
License:Apache License
/**
 * Returns the name of the current classloader context set on this scanner
 *
 * @param job
 *          the Hadoop job instance to be configured
 * @return name of the current context
 * @since 1.8.0
 */
public static String getClassLoaderContext(JobContext job) {
    return InputConfigurator.getClassLoaderContext(CLASS, job.getConfiguration());
}
From source file:org.apache.accumulo.core.client.mapreduce.AbstractInputFormat.java
License:Apache License
/**
 * Determines if the connector has been configured.
 *
 * @param context
 *          the Hadoop context for the configured job
 * @return true if the connector has been configured, false otherwise
 * @since 1.5.0
 * @see #setConnectorInfo(Job, String, AuthenticationToken)
 */
protected static Boolean isConnectorInfoSet(JobContext context) {
    return InputConfigurator.isConnectorInfoSet(CLASS, context.getConfiguration());
}
From source file:org.apache.accumulo.core.client.mapreduce.AbstractInputFormat.java
License:Apache License
/**
 * Gets the user name from the configuration.
 *
 * @param context
 *          the Hadoop context for the configured job
 * @return the user name
 * @since 1.5.0
 * @see #setConnectorInfo(Job, String, AuthenticationToken)
 */
protected static String getPrincipal(JobContext context) {
    return InputConfigurator.getPrincipal(CLASS, context.getConfiguration());
}
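The three AbstractInputFormat helpers above follow one pattern: a static method takes the JobContext, calls getConfiguration(), and delegates to InputConfigurator, which reads keys written earlier by the corresponding setters on the Job. A generic, hypothetical sketch of that pattern; the class, key, and method names below are illustrative and are not Accumulo's.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;

// Static setters write into the Job's Configuration at setup time; static
// getters read the same keys back through JobContext.getConfiguration()
// when splits are planned or records are read.
public abstract class ConfiguredInputFormat<K, V> extends InputFormat<K, V> {
    private static final String PRINCIPAL_KEY = "example.inputformat.principal";

    public static void setPrincipal(Job job, String principal) {
        job.getConfiguration().set(PRINCIPAL_KEY, principal);
    }

    protected static String getPrincipal(JobContext context) {
        return context.getConfiguration().get(PRINCIPAL_KEY);
    }

    protected static boolean isPrincipalSet(JobContext context) {
        return context.getConfiguration().get(PRINCIPAL_KEY) != null;
    }
}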