List of usage examples for org.apache.hadoop.mapreduce JobContext getConfiguration
public Configuration getConfiguration();
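Before the collected examples, a minimal sketch of the common pattern: getConfiguration() returns the job's Configuration, so InputFormats, Mappers, and Reducers can read per-job settings at runtime. The class name and the example.input.allow.splits property below are illustrative only and do not come from any of the projects listed further down.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

// Hypothetical InputFormat that reads a custom flag from the job configuration.
public class ExampleInputFormat extends TextInputFormat {
    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        // JobContext.getConfiguration() exposes the job's Configuration,
        // so per-job settings can steer InputFormat behaviour.
        Configuration conf = context.getConfiguration();
        boolean allowSplits = conf.getBoolean("example.input.allow.splits", true);
        return allowSplits && super.isSplitable(context, file);
    }
}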
From source file:net.thevis.groovyhadoop.backport.CombineFileInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSizeNode = 0;
    long minSizeRack = 0;
    long maxSize = 0;
    Configuration conf = job.getConfiguration();

    // the values specified by setxxxSplitSize() takes precedence over the
    // values that might have been specified in the config
    if (minSplitSizeNode != 0) {
        minSizeNode = minSplitSizeNode;
    } else {
        minSizeNode = conf.getLong(SPLIT_MINSIZE_PERNODE, 0);
    }
    if (minSplitSizeRack != 0) {
        minSizeRack = minSplitSizeRack;
    } else {
        minSizeRack = conf.getLong(SPLIT_MINSIZE_PERRACK, 0);
    }
    if (maxSplitSize != 0) {
        maxSize = maxSplitSize;
    } else {
        maxSize = conf.getLong("mapreduce.input.fileinputformat.split.maxsize", 0);
    }
    if (minSizeNode != 0 && maxSize != 0 && minSizeNode > maxSize) {
        throw new IOException("Minimum split size pernode " + minSizeNode
                + " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && maxSize != 0 && minSizeRack > maxSize) {
        throw new IOException("Minimum split size per rack" + minSizeRack
                + " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && minSizeNode > minSizeRack) {
        throw new IOException("Minimum split size per node" + minSizeNode
                + " cannot be smaller than minimum split " + "size per rack " + minSizeRack);
    }

    // all the files in input set
    Path[] paths = FileUtil.stat2Paths(listStatus(job).toArray(new FileStatus[0]));
    List<InputSplit> splits = new ArrayList<InputSplit>();
    if (paths.length == 0) {
        return splits;
    }

    // Convert them to Paths first. This is a costly operation and
    // we should do it first, otherwise we will incur doing it multiple
    // times, one time each for each pool in the next loop.
    List<Path> newpaths = new LinkedList<Path>();
    for (int i = 0; i < paths.length; i++) {
        Path p = new Path(paths[i].toUri().getPath());
        newpaths.add(p);
    }
    paths = null;

    // In one single iteration, process all the paths in a single pool.
    // Processing one pool at a time ensures that a split contains paths
    // from a single pool only.
    for (MultiPathFilter onepool : pools) {
        ArrayList<Path> myPaths = new ArrayList<Path>();

        // pick one input path. If it matches all the filters in a pool,
        // add it to the output set
        for (Iterator<Path> iter = newpaths.iterator(); iter.hasNext();) {
            Path p = iter.next();
            if (onepool.accept(p)) {
                myPaths.add(p); // add it to my output set
                iter.remove();
            }
        }
        // create splits for all files in this pool.
        getMoreSplits(conf, myPaths.toArray(new Path[myPaths.size()]), maxSize, minSizeNode, minSizeRack, splits);
    }

    // create splits for all files that are not in any pool.
    getMoreSplits(conf, newpaths.toArray(new Path[newpaths.size()]), maxSize, minSizeNode, minSizeRack, splits);

    // free up rackToNodes map
    rackToNodes.clear();
    return splits;
}
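The getSplits() method above falls back to configuration keys when the setXxxSplitSize() setters are not used. A hedged driver-side sketch of how those values might be supplied: the "mapreduce.input.fileinputformat.split.maxsize" key appears literally in the example, while the per-node and per-rack key names follow mainline Hadoop 2.x and may differ in the backport; the driver class name is illustrative.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class CombineSplitSizeDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Values read by the getSplits() fallback when the setters are not used.
        conf.setLong("mapreduce.input.fileinputformat.split.maxsize", 256L * 1024 * 1024);
        conf.setLong("mapreduce.input.fileinputformat.split.minsize.per.node", 64L * 1024 * 1024);
        conf.setLong("mapreduce.input.fileinputformat.split.minsize.per.rack", 128L * 1024 * 1024);
        Job job = Job.getInstance(conf, "combine-split-size-example");
        // ... set input/output paths, mapper, reducer, then submit.
    }
}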
From source file:nl.basjes.hadoop.input.ApacheHttpdLogfileInputFormat.java
License:Apache License
@Override
protected boolean isSplitable(JobContext context, Path file) {
    final CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
    return (null == codec) || codec instanceof SplittableCompressionCodec;
}
From source file:nl.gridline.zieook.runners.cf.RecommenderJobZieOok.java
License:Apache License
private static void setIOSort(JobContext job) {
    Configuration conf = job.getConfiguration();
    conf.setInt("io.sort.factor", 100);

    int assumedHeapSize = 512;
    String javaOpts = conf.get("mapred.child.java.opts");
    if (javaOpts != null) {
        Matcher m = Pattern.compile("-Xmx([0-9]+)([mMgG])").matcher(javaOpts);
        if (m.find()) {
            assumedHeapSize = Integer.parseInt(m.group(1));
            String megabyteOrGigabyte = m.group(2);
            if ("g".equalsIgnoreCase(megabyteOrGigabyte)) {
                assumedHeapSize *= 1024;
            }
        }
    }
    conf.setInt("io.sort.mb", assumedHeapSize / 2);

    // For some reason the Merger doesn't report status for a long time; increase
    // timeout when running these jobs
    conf.setInt("mapred.task.timeout", 60 * 60 * 1000);
}
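The helper above derives io.sort.mb from the -Xmx flag in the child JVM options and defaults to a 512 MB heap when no flag is present. The standalone sketch below (class and method names are illustrative) isolates that heuristic so its behaviour can be checked against a few sample values.

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class HeapParseSketch {
    // Same heuristic as setIOSort(): parse -Xmx, convert to MB, take half.
    static int ioSortMbFor(String javaOpts) {
        int assumedHeapSize = 512; // default when no -Xmx flag is found
        if (javaOpts != null) {
            Matcher m = Pattern.compile("-Xmx([0-9]+)([mMgG])").matcher(javaOpts);
            if (m.find()) {
                assumedHeapSize = Integer.parseInt(m.group(1));
                if ("g".equalsIgnoreCase(m.group(2))) {
                    assumedHeapSize *= 1024; // gigabytes to megabytes
                }
            }
        }
        return assumedHeapSize / 2;
    }

    public static void main(String[] args) {
        System.out.println(ioSortMbFor("-Xmx2g"));   // 1024
        System.out.println(ioSortMbFor("-Xmx512m")); // 256
        System.out.println(ioSortMbFor(null));       // 256 (falls back to the 512 MB assumption)
    }
}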
From source file:nyu.cs.webgraph.MRhelpers.LzoTabSeperatedTextInputFormat.java
License:Open Source License
@Override
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    List<FileStatus> files = super.listStatus(job);

    String fileExtension = new LzopCodec().getDefaultExtension();
    Configuration conf = job.getConfiguration();

    for (Iterator<FileStatus> iterator = files.iterator(); iterator.hasNext();) {
        FileStatus fileStatus = iterator.next();
        Path file = fileStatus.getPath();
        FileSystem fs = file.getFileSystem(conf);

        if (!file.toString().endsWith(fileExtension)) {
            // get rid of non lzo files
            iterator.remove();
        } else {
            // read the index file
            LzoIndex index = LzoIndex.readIndex(fs, file);
            indexes.put(file, index);
        }
    }
    return files;
}
From source file:nyu.cs.webgraph.MRhelpers.LzoTabSeperatedTextInputFormat.java
License:Open Source License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> splits = super.getSplits(job);
    Configuration conf = job.getConfiguration();

    // find new start/ends of the filesplit that aligns
    // with the lzo blocks
    List<InputSplit> result = new ArrayList<InputSplit>();

    for (InputSplit genericSplit : splits) {
        // load the index
        FileSplit fileSplit = (FileSplit) genericSplit;
        Path file = fileSplit.getPath();
        FileSystem fs = file.getFileSystem(conf);
        LzoIndex index = indexes.get(file);
        if (index == null) {
            throw new IOException("Index not found for " + file);
        }
        if (index.isEmpty()) {
            // empty index, keep as is
            result.add(fileSplit);
            continue;
        }

        long start = fileSplit.getStart();
        long end = start + fileSplit.getLength();

        long lzoStart = index.alignSliceStartToIndex(start, end);
        long lzoEnd = index.alignSliceEndToIndex(end, fs.getFileStatus(file).getLen());

        if (lzoStart != LzoIndex.NOT_FOUND && lzoEnd != LzoIndex.NOT_FOUND) {
            result.add(new FileSplit(file, lzoStart, lzoEnd - lzoStart, fileSplit.getLocations()));
        }
    }
    return result;
}
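Both overrides above run during job submission: listStatus() keeps only LZO files and caches their indexes, and getSplits() realigns split boundaries to LZO block boundaries. A hedged driver sketch showing how such an input format might be wired into a job; the driver class name and argument handling are illustrative.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import nyu.cs.webgraph.MRhelpers.LzoTabSeperatedTextInputFormat;

public class LzoInputDriver {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "lzo-input-example");
        job.setJarByClass(LzoInputDriver.class);
        // Use the LZO-aware input format so splits respect LZO block boundaries.
        job.setInputFormatClass(LzoTabSeperatedTextInputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        // ... mapper, reducer and output settings omitted.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}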
From source file:oracle.kv.hadoop.KVInputFormatBase.java
License:Open Source License
/**
 * @hidden
 * Logically split the set of input data for the job.
 *
 * @param context job configuration.
 *
 * @return an array of {@link InputSplit}s for the job.
 */
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    if (context != null) {
        final Configuration conf = context.getConfiguration();
        initializeParameters(conf);
    }

    if (kvStoreName == null) {
        throw new IllegalArgumentException("No KV Store Name provided. Use either the "
                + ParamConstant.KVSTORE_NAME.getName() + " parameter or call "
                + KVInputFormatBase.class.getName() + ".setKVStoreName().");
    }

    if (kvHelperHosts == null) {
        throw new IllegalArgumentException("No KV Helper Hosts were provided. Use either the "
                + ParamConstant.KVSTORE_NODES.getName() + " parameter or call "
                + KVInputFormatBase.class.getName() + ".setKVHelperHosts().");
    }

    final KVStoreLogin storeLogin = new KVStoreLogin(null, kvStoreSecurityFile);
    storeLogin.loadSecurityProperties();
    storeLogin.prepareRegistryCSF();
    LoginManager loginMgr = null;
    if (storeLogin.foundSSLTransport()) {
        loginMgr = KVStoreLogin.getRepNodeLoginMgr(kvHelperHosts, storeLogin.getLoginCredentials(), kvStoreName);
    }

    Topology topology = null;
    try {
        topology = TopologyLocator.get(kvHelperHosts, 0, loginMgr, kvStoreName);
    } catch (KVStoreException KVSE) {
        KVSE.printStackTrace();
        return null;
    }

    /* Create a set of splits based on shards and consistency */
    final SplitBuilder sb = new SplitBuilder(topology);
    final List<TopoSplit> splits = sb.createShardSplits(consistency);
    final List<InputSplit> ret = new ArrayList<InputSplit>(splits.size());
    final RegistryUtils regUtils = new RegistryUtils(topology, loginMgr);

    for (TopoSplit ts : splits) {
        if (ts.isEmpty()) {
            /* Split is empty, skip */
            continue;
        }

        final List<String> repNodeNames = new ArrayList<String>();
        final List<String> repNodeNamesAndPorts = new ArrayList<String>();

        for (StorageNode sn : ts.getSns(consistency, topology, regUtils)) {
            repNodeNames.add(sn.getHostname());
            repNodeNamesAndPorts.add(sn.getHostname() + ":" + sn.getRegistryPort());
        }

        ret.add(new KVInputSplit().setKVHelperHosts(repNodeNamesAndPorts.toArray(new String[0]))
                .setKVStoreName(kvStoreName).setKVStoreSecurityFile(storeLogin.getSecurityFilePath())
                .setLocations(repNodeNames.toArray(new String[0])).setDirection(direction)
                .setBatchSize(batchSize).setParentKey(parentKey).setSubRange(subRange).setDepth(depth)
                .setConsistency(consistency).setTimeout(timeout).setTimeoutUnit(timeoutUnit)
                .setFormatterClassName(formatterClassName).setPartitionSets(ts.getPartitionSets()));
    }

    return ret;
}
From source file:oracle.kv.hadoop.table.TableInputFormatBase.java
License:Open Source License
/**
 * @hidden
 * Logically split the set of input data for the job.
 *
 * @param context job configuration.
 *
 * @return an array of {@link InputSplit}s for the job.
 */
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    if (context != null) {
        final Configuration conf = context.getConfiguration();
        initializeParameters(conf);
    }

    if (kvStoreName == null) {
        throw new IllegalArgumentException("No KV Store Name provided. Use either the "
                + ParamConstant.KVSTORE_NAME.getName() + " parameter or call "
                + TableInputFormatBase.class.getName() + ".setKVStoreName().");
    }

    if (kvHelperHosts == null) {
        throw new IllegalArgumentException("No KV Helper Hosts were provided. Use either the "
                + ParamConstant.KVSTORE_NODES.getName() + " parameter or call "
                + TableInputFormatBase.class.getName() + ".setKVHelperHosts().");
    }

    if (kvHadoopHosts == null) {
        kvHadoopHosts = new String[kvHelperHosts.length];
        for (int i = 0; i < kvHelperHosts.length; i++) {
            /* Strip off the ':port' suffix */
            final String[] hostPort = (kvHelperHosts[i]).trim().split(":");
            kvHadoopHosts[i] = hostPort[0];
        }
    }

    if (tableName == null) {
        throw new IllegalArgumentException("No Table Name provided. Use either the "
                + ParamConstant.TABLE_NAME.getName() + " parameter or call "
                + TableInputFormatBase.class.getName() + ".setTableName().");
    }

    final String userName = (passwordCredentials == null ? null : passwordCredentials.getUsername());
    final KVStoreLogin storeLogin = new KVStoreLogin(userName, localLoginFile);
    storeLogin.loadSecurityProperties();
    storeLogin.prepareRegistryCSF();
    LoginManager loginMgr = null;
    if (storeLogin.foundSSLTransport()) {
        loginMgr = KVStoreLogin.getRepNodeLoginMgr(kvHelperHosts, passwordCredentials, kvStoreName);
    }

    /*
     * Retrieve the topology of the store.
     *
     * Note that if the same Hive CLI session is used to run queries that
     * must connect to different KVStores where one store is non-secure
     * and the other is secure, then if the most recent call to this method
     * invoked the code below to retrieve the topology from the secure
     * store, then the security information is stored in the system
     * properties and the state of the splits, and the client socket
     * factory used when communicating with the RMI registry while
     * retrieving the topology is configured for SSL communication. As
     * a result, if the current call to this method invokes the code below
     * to retrieve the topology of the non-secure store, and if the client
     * socket factory is not reconfigured for non-SSL communication, then
     * a KVServerException (wrapping a java.rmi.ConnectIOException) will
     * be encountered. To address this, KVStoreException is caught, the
     * client socket factory is reconfigured for non-SSL communication,
     * and the attempt to retrieve the topology is retried with no
     * security information.
     *
     * If both secure and non-secure attempts fail, then the stack trace
     * is sent to both the DataNode's stderr log file and the Hive CLI
     * display screen.
     */
    Topology topology;
    try {
        topology = TopologyLocator.get(kvHelperHosts, 0, loginMgr, kvStoreName);
    } catch (KVStoreException e) {
        if (passwordCredentials != null) {
            /* Retry with no security */
            LOG.debug("Failure on topology retrieval: attempt to "
                    + "communicate with RMI registry over SSL unsuccessful. "
                    + "Changing from SSLClientSocketFactory to "
                    + "ClientSocketFactory and retrying ...");
            ClientSocketFactory.setRMIPolicy(null, kvStoreName);
            RegistryUtils.initRegistryCSF();
            try {
                topology = TopologyLocator.get(kvHelperHosts, 0, null, kvStoreName);
            } catch (KVStoreException e1) {
                e1.printStackTrace(); /* Send to DataNode's stderr file. */
                throw new IOException(e1); /* Send to Hive CLI. */
            }
        } else {
            e.printStackTrace(); /* Send to DataNode's stderr file. */
            throw new IOException(e); /* Send to Hive CLI. */
        }
    }

    /* Create splits based on the store's partitions or its shards. */
    final List<TopoSplitWrapper> splits = getSplitInfo(topology, consistency, queryBy, shardKeyPartitionId);
    final List<InputSplit> ret = new ArrayList<InputSplit>(splits.size());

    for (TopoSplitWrapper ts : splits) {
        final TableInputSplit split = new TableInputSplit();
        split.setKVStoreName(kvStoreName);
        split.setKVHelperHosts(kvHelperHosts);
        split.setLocations(kvHadoopHosts);
        split.setTableName(tableName);
        split.setKVStoreSecurity(loginFlnm, passwordCredentials, trustFlnm);
        split.setPrimaryKeyProperty(primaryKeyProperty);

        /* For MultiRowOptions */
        split.setFieldRangeProperty(fieldRangeProperty);

        /* For TableIteratorOptions */
        split.setDirection(direction);
        split.setConsistency(consistency);
        split.setTimeout(timeout);
        split.setTimeoutUnit(timeoutUnit);
        split.setMaxRequests(maxRequests);
        split.setBatchSize(batchSize);
        split.setMaxBatches(maxBatches);

        split.setPartitionSets(ts.getPartitionSets());
        split.setQueryInfo(queryBy, whereClause);
        split.setShardSet(ts.getShardSet());

        ret.add(split);
    }
    return ret;
}
From source file:org.apache.accumulo.core.client.mapreduce.AbstractInputFormat.java
License:Apache License
/**
 * Returns the name of the current classloader context set on this scanner
 *
 * @param job
 *          the Hadoop job instance to be configured
 * @return name of the current context
 * @since 1.8.0
 */
public static String getClassLoaderContext(JobContext job) {
    return InputConfigurator.getClassLoaderContext(CLASS, job.getConfiguration());
}
From source file:org.apache.accumulo.core.client.mapreduce.AbstractInputFormat.java
License:Apache License
/**
 * Determines if the connector has been configured.
 *
 * @param context
 *          the Hadoop context for the configured job
 * @return true if the connector has been configured, false otherwise
 * @since 1.5.0
 * @see #setConnectorInfo(Job, String, AuthenticationToken)
 */
protected static Boolean isConnectorInfoSet(JobContext context) {
    return InputConfigurator.isConnectorInfoSet(CLASS, context.getConfiguration());
}
From source file:org.apache.accumulo.core.client.mapreduce.AbstractInputFormat.java
License:Apache License
/**
 * Gets the user name from the configuration.
 *
 * @param context
 *          the Hadoop context for the configured job
 * @return the user name
 * @since 1.5.0
 * @see #setConnectorInfo(Job, String, AuthenticationToken)
 */
protected static String getPrincipal(JobContext context) {
    return InputConfigurator.getPrincipal(CLASS, context.getConfiguration());
}
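The three AbstractInputFormat helpers above follow one pattern: a static method takes the JobContext, calls getConfiguration(), and delegates to InputConfigurator, which reads keys written earlier by the corresponding setters on the Job. A generic, hypothetical sketch of that pattern; the class, key, and method names below are illustrative and are not Accumulo's.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;

// Static setters write into the Job's Configuration at setup time; static
// getters read the same keys back through JobContext.getConfiguration()
// when splits are planned or records are read.
public abstract class ConfiguredInputFormat<K, V> extends InputFormat<K, V> {
    private static final String PRINCIPAL_KEY = "example.inputformat.principal";

    public static void setPrincipal(Job job, String principal) {
        job.getConfiguration().set(PRINCIPAL_KEY, principal);
    }

    protected static String getPrincipal(JobContext context) {
        return context.getConfiguration().get(PRINCIPAL_KEY);
    }

    protected static boolean isPrincipalSet(JobContext context) {
        return context.getConfiguration().get(PRINCIPAL_KEY) != null;
    }
}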