Usage examples for org.apache.hadoop.mapreduce.JobContext.getConfiguration()
public Configuration getConfiguration();
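Before the examples, a minimal sketch of the call itself: getConfiguration() returns the job's Configuration, from which typed properties are read. The key "example.max.records" below is illustrative, not a Hadoop-defined property.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.JobContext;

public class GetConfigurationSketch {
    // Read a typed property from the job's configuration, with a default.
    public static long maxRecords(JobContext context) {
        Configuration conf = context.getConfiguration();
        return conf.getLong("example.max.records", 1000L);
    }
}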
From source file: com.marklogic.contentpump.FileAndDirectoryInputFormat.java
License: Apache License
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    Path[] dirs = getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }
    // get tokens for all the required FileSystems
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration());
    // Whether we need to recursively look into the directory structure
    boolean recursive = getInputDirRecursive(job);
    // creates a MultiPathFilter with the hiddenFileFilter and the
    // user-provided one (if any).
    List<PathFilter> filters = new ArrayList<PathFilter>();
    filters.add(hiddenFileFilter);
    PathFilter jobFilter = getInputPathFilter(job);
    if (jobFilter != null) {
        filters.add(jobFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(filters);
    List<FileStatus> result = simpleListStatus(job, dirs, inputFilter, recursive);
    LOG.info("Total input paths to process : " + result.size());
    return result;
}
From source file: com.marklogic.contentpump.FileAndDirectoryInputFormat.java
License: Apache License
private List<FileStatus> simpleListStatus(JobContext job, Path[] dirs, PathFilter inputFilter,
        boolean recursive) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();
    List<IOException> errors = new ArrayList<IOException>();
    Configuration conf = job.getConfiguration();
    for (int i = 0; i < dirs.length; ++i) {
        Path p = dirs[i];
        FileSystem fs = p.getFileSystem(conf);
        FileStatus[] matches = fs.globStatus(p, inputFilter);
        if (matches == null) {
            errors.add(new IOException("Input path does not exist: " + p));
        } else if (matches.length == 0) {
            errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
        } else {
            for (FileStatus globStat : matches) {
                if (globStat.isDirectory()) {
                    FileStatus[] files = fs.listStatus(globStat.getPath(), inputFilter);
                    for (int j = 0; j < files.length; j++) {
                        if (recursive && files[j].isDirectory()) {
                            simpleAddInputPathRecursively(result, fs, files[j].getPath(), inputFilter);
                        } else {
                            result.add(files[j]);
                        }
                    }
                } else {
                    result.add(globStat);
                }
            }
        }
    }
    if (!errors.isEmpty()) {
        throw new InvalidInputException(errors);
    }
    return result;
}
From source file: com.marklogic.contentpump.MultithreadedMapper.java
License: Apache License
/**
 * The number of threads in the thread pool that will run the map function.
 *
 * @param job
 *            the job
 * @return the number of threads
 */
public static int getNumberOfThreads(JobContext job) {
    return job.getConfiguration().getInt(ConfigConstants.CONF_THREADS_PER_SPLIT, 10);
}
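For context, a hypothetical driver-side counterpart: setting the property that getNumberOfThreads() reads. The package for ConfigConstants is assumed from the source-file path of the example; the job name and thread count are illustrative.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import com.marklogic.contentpump.ConfigConstants;      // assumed location
import com.marklogic.contentpump.MultithreadedMapper;

public class ThreadsPerSplitExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Override the default of 10 threads per split.
        conf.setInt(ConfigConstants.CONF_THREADS_PER_SPLIT, 4);
        Job job = Job.getInstance(conf, "multithreaded-import");
        System.out.println("threads per split = "
                + MultithreadedMapper.getNumberOfThreads(job));
    }
}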
From source file: com.marklogic.contentpump.MultithreadedMapper.java
License: Apache License
/**
 * Get the application's mapper class.
 *
 * @param <K1>
 *            the map's input key type
 * @param <V1>
 *            the map's input value type
 * @param <K2>
 *            the map's output key type
 * @param <V2>
 *            the map's output value type
 * @param job
 *            the job
 * @return the mapper class to run
 */
@SuppressWarnings("unchecked")
public static <K1, V1, K2, V2> Class<BaseMapper<K1, V1, K2, V2>> getMapperClass(JobContext job) {
    Configuration conf = job.getConfiguration();
    return (Class<BaseMapper<K1, V1, K2, V2>>) conf.getClass(ConfigConstants.CONF_MULTITHREADEDMAPPER_CLASS,
            BaseMapper.class);
}
From source file: com.marklogic.mapreduce.ForestInputFormat.java
License: Apache License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);
    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) { // stand directories
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job.getConfiguration());
        FileStatus[] children = fs.listStatus(path);
        FileStatus treeIndexStatus = null, treeDataStatus = null,
                ordinalsStatus = null, timestampsStatus = null;
        boolean obsolete = false;
        for (FileStatus child : children) {
            String fileName = child.getPath().getName();
            if (fileName.equals("TreeData")) { // inside a stand
                treeDataStatus = child;
            } else if (fileName.equals("TreeIndex")) {
                treeIndexStatus = child;
            } else if (fileName.equals("Ordinals")) {
                ordinalsStatus = child;
            } else if (fileName.equals("Timestamps")) {
                timestampsStatus = child;
            } else if (fileName.equals("Obsolete")) {
                obsolete = true;
                break;
            }
        }
        if (obsolete) {
            LOG.warn("Obsolete file found. The forest is either live or isn't "
                    + "dismounted cleanly. Ignoring stand " + path);
            break;
        }
        if (treeDataStatus == null) {
            throw new RuntimeException("TreeData file not found.");
        } else if (treeIndexStatus == null) {
            throw new RuntimeException("TreeIndex file not found.");
        } else if (ordinalsStatus == null) {
            throw new RuntimeException("Ordinals file not found.");
        } else if (timestampsStatus == null) {
            throw new RuntimeException("Timestamps file not found.");
        }
        long treeDataSize = treeDataStatus.getLen();
        if (treeDataSize == 0) {
            // unexpected, give up this stand
            LOG.warn("Found empty TreeData file. Skipping...");
            continue; // skipping this stand
        }
        Path treeDataPath = treeDataStatus.getPath();
        long blockSize = treeDataStatus.getBlockSize();
        long splitSize = computeSplitSize(blockSize, minSize, maxSize);
        // make splits based on TreeIndex
        FSDataInputStream is = fs.open(treeIndexStatus.getPath());
        BiendianDataInputStream in = new BiendianDataInputStream(is);
        int prevDocid = -1, docid = -1, position = 0;
        long prevOffset = -1L, offset = 0, splitStart = 0;
        BlockLocation[] blkLocations = fs.getFileBlockLocations(treeDataStatus, 0, treeDataSize);
        try {
            for (;; ++position) {
                try {
                    docid = in.readInt();
                    in.readInt();
                    offset = in.readLong();
                } catch (EOFException e) {
                    break;
                }
                int comp = InternalUtilities.compareUnsignedLong(offset, treeDataSize);
                if (comp > 0) {
                    throw new RuntimeException("TreeIndex offset is out of bound: position = "
                            + position + ", offset = " + offset
                            + ", treeDataSize = " + treeDataSize);
                }
                if (prevDocid != -1 && (docid & 0xffffffffL) <= (prevDocid & 0xffffffffL)) {
                    throw new RuntimeException("docid out of order, position = " + position
                            + ", docid = " + docid + ", prevDocid = " + prevDocid);
                }
                prevDocid = docid;
                if (prevOffset != -1L
                        && InternalUtilities.compareUnsignedLong(offset, prevOffset) <= 0) {
                    throw new RuntimeException("offset out of order, position = " + position
                            + ", offset = " + offset + ", prevOffset = " + prevOffset);
                }
                long splitLen = offset - splitStart;
                if (splitLen == splitSize || (splitLen > splitSize
                        && splitLen - splitSize <= splitSize - (prevOffset - splitStart))) {
                    int blkIndex = getBlockIndex(blkLocations, offset);
                    InputSplit split = new FileSplit(treeDataPath, splitStart, splitLen,
                            blkLocations[blkIndex].getHosts());
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Created split: start=" + splitStart + " len=" + splitLen
                                + " last docid=" + docid);
                    }
                    splits.add(split);
                    splitStart = offset;
                } else if (splitLen > splitSize) {
                    int blkIndex = getBlockIndex(blkLocations, prevOffset);
                    InputSplit split = new FileSplit(treeDataPath, splitStart,
                            prevOffset - splitStart, blkLocations[blkIndex].getHosts());
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Created split: start=" + splitStart + " len="
                                + (prevOffset - splitStart) + " last docid=" + docid);
                    }
                    splits.add(split);
                    splitStart = prevOffset;
                }
            }
        } finally {
            in.close();
        }
        if (offset > splitStart) {
            int blkIndex = getBlockIndex(blkLocations, offset - 1);
            InputSplit split = new FileSplit(treeDataPath, splitStart, offset - splitStart,
                    blkLocations[blkIndex].getHosts());
            if (LOG.isDebugEnabled()) {
                LOG.debug("Created split: start=" + splitStart + " len=" + (offset - splitStart)
                        + " last docid=" + docid);
            }
            splits.add(split);
        }
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("Made " + splits.size() + " splits.");
    }
    return splits;
}
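The target split size used above comes from FileInputFormat, which ForestInputFormat builds on. As a reference sketch, the standard Hadoop implementation is essentially the file's block size clamped between the configured minimum and maximum:

// Reference sketch of Hadoop FileInputFormat's split-size computation,
// which the getSplits() above relies on.
protected long computeSplitSize(long blockSize, long minSize, long maxSize) {
    return Math.max(minSize, Math.min(maxSize, blockSize));
}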
From source file: com.marklogic.mapreduce.MarkLogicInputFormat.java
License: Apache License
/**
 * Get input splits.
 * @param jobContext job context
 * @return list of input splits
 */
@SuppressWarnings("unchecked")
@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException, InterruptedException {
    // get input from job configuration
    jobConf = jobContext.getConfiguration();
    boolean advancedMode = jobConf.get(INPUT_MODE, BASIC_MODE).equals(ADVANCED_MODE);
    String splitQuery;
    String queryLanguage = null;
    String[] redactionRuleCol = jobConf.getStrings(REDACTION_RULE_COLLECTION);
    if (advancedMode) {
        queryLanguage = jobConf.get(INPUT_QUERY_LANGUAGE);
        splitQuery = jobConf.get(SPLIT_QUERY);
    } else {
        StringBuilder buf = new StringBuilder();
        buf.append("xquery version \"1.0-ml\";\n");
        buf.append("import module namespace hadoop = ");
        buf.append("\"http://marklogic.com/xdmp/hadoop\" at ");
        buf.append("\"/MarkLogic/hadoop.xqy\";\n");
        if (redactionRuleCol != null) {
            buf.append("import module namespace rdt = "
                    + "\"http://marklogic.com/xdmp/redaction\" at "
                    + "\"/MarkLogic/redaction.xqy\";\n");
        }
        buf.append("xdmp:host-name(xdmp:host()),\n");
        buf.append("hadoop:get-splits(\'");
        appendNsBindings(buf);
        buf.append("\', \'");
        appendDocumentSelector(buf);
        buf.append("\',");
        appendQuery(buf);
        buf.append("),\n");
        appendRedactionRuleValidateQuery(buf, redactionRuleCol);
        buf.append(",0,");
        appendCustom(buf);
        splitQuery = buf.toString();
    }
    String mode = jobConf.get(EXECUTION_MODE, MODE_DISTRIBUTED);
    long defaultSplitSize = mode.equals(MODE_DISTRIBUTED) ? DEFAULT_MAX_SPLIT_SIZE
            : DEFAULT_LOCAL_MAX_SPLIT_SIZE;
    long maxSplitSize = jobConf.getLong(MAX_SPLIT_SIZE, defaultSplitSize);
    if (maxSplitSize <= 0) {
        throw new IllegalStateException(
                "Max split size is required to be positive. It is set to " + maxSplitSize);
    }
    // fetch data from server
    List<ForestSplit> forestSplits = new ArrayList<ForestSplit>();
    Session session = null;
    ResultSequence result = null;
    if (LOG.isDebugEnabled()) {
        LOG.debug("Split query: " + splitQuery);
    }
    localMode = MODE_LOCAL.equals(jobConf.get(EXECUTION_MODE));
    try {
        ContentSource cs = InternalUtilities.getInputContentSource(jobConf);
        session = cs.newSession();
        RequestOptions options = new RequestOptions();
        options.setCacheResult(false);
        if (localMode && advancedMode) {
            AdhocQuery hostQuery = session
                    .newAdhocQuery("xquery version \"1.0-ml\";xdmp:host-name(xdmp:host())");
            hostQuery.setOptions(options);
            result = session.submitRequest(hostQuery);
            if (result.hasNext()) {
                ResultItem item = result.next();
                localHost = item.asString();
            }
            if (result != null) {
                result.close();
            }
        }
        AdhocQuery query = session.newAdhocQuery(splitQuery);
        if (queryLanguage != null) {
            InternalUtilities.checkQueryLanguage(queryLanguage);
            options.setQueryLanguage(queryLanguage);
        }
        query.setOptions(options);
        result = session.submitRequest(query);
        if (!advancedMode && result.hasNext()) {
            ResultItem item = result.next();
            localHost = item.asString();
        }
        List<String> ruleUris = null;
        if (redactionRuleCol != null) {
            ruleUris = new ArrayList<String>();
        }
        getForestSplits(jobContext, result, forestSplits, ruleUris);
        LOG.info("Fetched " + forestSplits.size() + " forest splits.");
    } catch (XccConfigException e) {
        LOG.error(e);
        throw new IOException(e);
    } catch (RequestException e) {
        LOG.error(e);
        LOG.error("Query: " + splitQuery);
        throw new IOException(e);
    } catch (URISyntaxException e) {
        LOG.error(e);
        throw new IOException(e);
    } finally {
        if (result != null) {
            result.close();
        }
        if (session != null) {
            session.close();
        }
    }
    // create a split list per forest per host
    if (forestSplits == null || forestSplits.isEmpty()) {
        return new ArrayList<InputSplit>();
    }
    // construct a list of splits per forest per host
    Map<String, List<List<InputSplit>>> hostForestSplits =
            new HashMap<String, List<List<InputSplit>>>();
    boolean tsQuery = (jobConf.get(INPUT_QUERY_TIMESTAMP) != null);
    for (int i = 0; i < forestSplits.size(); i++) {
        ForestSplit fsplit = forestSplits.get(i);
        List<InputSplit> splits = null;
        if (fsplit.recordCount > 0 || !tsQuery) {
            String host = fsplit.hostName;
            List<List<InputSplit>> splitLists = hostForestSplits.get(host);
            if (splitLists == null) {
                splitLists = new ArrayList<List<InputSplit>>();
                hostForestSplits.put(host, splitLists);
            }
            splits = new ArrayList<InputSplit>();
            splitLists.add(splits);
        } else {
            continue;
        }
        if (fsplit.recordCount < maxSplitSize) {
            MarkLogicInputSplit split = new MarkLogicInputSplit(0, fsplit.recordCount,
                    fsplit.forestId, fsplit.hostName);
            split.setLastSplit(true);
            splits.add(split);
            if (LOG.isDebugEnabled()) {
                LOG.debug("Added split " + split);
            }
        } else {
            long splitCount = fsplit.recordCount / maxSplitSize;
            long remainder = fsplit.recordCount % maxSplitSize;
            if (remainder != 0) {
                splitCount++;
            }
            long splitSize = fsplit.recordCount / splitCount;
            remainder = fsplit.recordCount % splitCount;
            if (remainder != 0) {
                splitSize++;
            }
            if (this instanceof KeyValueInputFormat<?, ?>) {
                // each split size has to be an even number
                if ((splitSize & 0x1) != 0) {
                    splitSize++;
                }
            }
            long remainingCount = fsplit.recordCount;
            while (remainingCount > 0) {
                long start = fsplit.recordCount - remainingCount;
                long length = splitSize;
                MarkLogicInputSplit split = new MarkLogicInputSplit(start, length,
                        fsplit.forestId, fsplit.hostName);
                if (remainingCount <= maxSplitSize) {
                    split.setLastSplit(true);
                }
                splits.add(split);
                remainingCount -= length;
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Added split " + split);
                }
            }
        }
    }
    // mix the lists of splits into one per host
    Set<String> hosts = hostForestSplits.keySet();
    int hostCount = hosts.size();
    List<InputSplit>[] hostSplits = (List<InputSplit>[]) new List<?>[hostCount];
    int i = 0;
    for (String host : hosts) {
        List<List<InputSplit>> splitLists = hostForestSplits.get(host);
        if (splitLists.size() == 1) {
            hostSplits[i++] = splitLists.get(0);
        } else {
            hostSplits[i] = new ArrayList<InputSplit>();
            boolean more = true;
            for (int j = 0; more; j++) {
                more = false;
                for (List<InputSplit> splitsPerForest : splitLists) {
                    if (j < splitsPerForest.size()) {
                        hostSplits[i].add(splitsPerForest.get(j));
                    }
                    more = more || (j + 1 < splitsPerForest.size());
                }
            }
            i++;
        }
    }
    // mix hostSplits into one
    List<InputSplit> splitList = new ArrayList<InputSplit>();
    boolean more = true;
    for (int j = 0; more; j++) {
        more = false;
        for (List<InputSplit> splitsPerHost : hostSplits) {
            if (j < splitsPerHost.size()) {
                splitList.add(splitsPerHost.get(j));
            }
            more = more || (j + 1 < splitsPerHost.size());
        }
    }
    LOG.info("Made " + splitList.size() + " splits.");
    if (LOG.isDebugEnabled()) {
        for (InputSplit split : splitList) {
            LOG.debug(split);
        }
    }
    return splitList;
}
From source file: com.metamx.druid.indexer.Utils.java
License: Open Source License
public static OutputStream makePathAndOutputStream(JobContext job, Path outputPath,
        boolean deleteExisting) throws IOException {
    OutputStream retVal;
    FileSystem fs = outputPath.getFileSystem(job.getConfiguration());
    if (fs.exists(outputPath)) {
        if (deleteExisting) {
            fs.delete(outputPath, false);
        } else {
            throw new ISE("outputPath[%s] must not exist.", outputPath);
        }
    }
    if (!FileOutputFormat.getCompressOutput(job)) {
        retVal = fs.create(outputPath, false);
    } else {
        Class<? extends CompressionCodec> codecClass =
                FileOutputFormat.getOutputCompressorClass(job, GzipCodec.class);
        CompressionCodec codec = ReflectionUtils.newInstance(codecClass, job.getConfiguration());
        outputPath = new Path(outputPath.toString() + codec.getDefaultExtension());
        retVal = codec.createOutputStream(fs.create(outputPath, false));
    }
    return retVal;
}
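A hypothetical caller sketch: whether makePathAndOutputStream() compresses and renames the output depends on the standard FileOutputFormat compression settings on the job. The path and job setup below are illustrative.

// Hypothetical usage: with compression enabled via the standard
// FileOutputFormat settings, makePathAndOutputStream() above appends the
// codec extension and wraps the stream in a compressing one.
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class CompressedOutputSetup {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance();
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
        // Utils.makePathAndOutputStream(job, new Path("/tmp/stats.json"), true)
        // would now write to /tmp/stats.json.gz through a gzip stream.
    }
}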
From source file: com.metamx.druid.indexer.Utils.java
License: Open Source License
public static InputStream openInputStream(JobContext job, Path inputPath) throws IOException {
    return openInputStream(inputPath, inputPath.getFileSystem(job.getConfiguration()));
}
From source file: com.metamx.druid.indexer.Utils.java
License: Open Source License
public static Map<String, Object> getStats(JobContext job, Path statsPath) throws IOException {
    FileSystem fs = statsPath.getFileSystem(job.getConfiguration());
    return jsonMapper.readValue(fs.open(statsPath), new TypeReference<Map<String, Object>>() {
    });
}
From source file: com.mongodb.hadoop.BSONFileInputFormat.java
License: Apache License
public static PathFilter getInputPathFilter(final JobContext context) {
    Configuration conf = context.getConfiguration();
    Class<?> filterClass = conf.getClass("bson.pathfilter.class", null, PathFilter.class);
    return filterClass != null ? (PathFilter) ReflectionUtils.newInstance(filterClass, conf) : null;
}
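A hypothetical setup sketch showing how the "bson.pathfilter.class" property read above could be populated. MyBsonFilter is an illustrative filter, not part of the MongoDB Hadoop connector.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class BsonFilterSetup {
    // Illustrative filter: only accept paths ending in ".bson".
    public static class MyBsonFilter implements PathFilter {
        @Override
        public boolean accept(Path path) {
            return path.getName().endsWith(".bson");
        }
    }

    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Register the filter so getInputPathFilter() will instantiate it
        // via ReflectionUtils (the filter needs a no-arg constructor).
        conf.setClass("bson.pathfilter.class", MyBsonFilter.class, PathFilter.class);
    }
}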