Usage examples for org.apache.hadoop.mapreduce.JobContext.getConfiguration()
public Configuration getConfiguration();
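Before the examples, a minimal sketch of the call itself: getConfiguration() returns the job's Configuration, from which typed properties are read. The key "example.max.records" below is illustrative, not a Hadoop-defined property.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.JobContext;

public class GetConfigurationSketch {
    // Read a typed property from the job's configuration, with a default.
    public static long maxRecords(JobContext context) {
        Configuration conf = context.getConfiguration();
        return conf.getLong("example.max.records", 1000L);
    }
}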
From source file: com.marklogic.contentpump.FileAndDirectoryInputFormat.java
License: Apache License
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    Path[] dirs = getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }
    // get tokens for all the required FileSystems
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration());
    // Whether we need to recursively look into the directory structure
    boolean recursive = getInputDirRecursive(job);
    // creates a MultiPathFilter with the hiddenFileFilter and the
    // user-provided one (if any).
    List<PathFilter> filters = new ArrayList<PathFilter>();
    filters.add(hiddenFileFilter);
    PathFilter jobFilter = getInputPathFilter(job);
    if (jobFilter != null) {
        filters.add(jobFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(filters);
    List<FileStatus> result = simpleListStatus(job, dirs, inputFilter, recursive);
    LOG.info("Total input paths to process : " + result.size());
    return result;
}
From source file: com.marklogic.contentpump.FileAndDirectoryInputFormat.java
License: Apache License
private List<FileStatus> simpleListStatus(JobContext job, Path[] dirs, PathFilter inputFilter,
        boolean recursive) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();
    List<IOException> errors = new ArrayList<IOException>();
    Configuration conf = job.getConfiguration();
    for (int i = 0; i < dirs.length; ++i) {
        Path p = dirs[i];
        FileSystem fs = p.getFileSystem(conf);
        FileStatus[] matches = fs.globStatus(p, inputFilter);
        if (matches == null) {
            errors.add(new IOException("Input path does not exist: " + p));
        } else if (matches.length == 0) {
            errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
        } else {
            for (FileStatus globStat : matches) {
                if (globStat.isDirectory()) {
                    FileStatus[] files = fs.listStatus(globStat.getPath(), inputFilter);
                    for (int j = 0; j < files.length; j++) {
                        if (recursive && files[j].isDirectory()) {
                            simpleAddInputPathRecursively(result, fs, files[j].getPath(), inputFilter);
                        } else {
                            result.add(files[j]);
                        }
                    }
                } else {
                    result.add(globStat);
                }
            }
        }
    }
    if (!errors.isEmpty()) {
        throw new InvalidInputException(errors);
    }
    return result;
}
From source file: com.marklogic.contentpump.MultithreadedMapper.java
License: Apache License
/**
 * The number of threads in the thread pool that will run the map function.
 *
 * @param job
 *            the job
 * @return the number of threads
 */
public static int getNumberOfThreads(JobContext job) {
    return job.getConfiguration().getInt(ConfigConstants.CONF_THREADS_PER_SPLIT, 10);
}
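For context, a hypothetical driver-side counterpart: setting the property that getNumberOfThreads() reads. The package for ConfigConstants is assumed from the source-file path of the example; the job name and thread count are illustrative.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import com.marklogic.contentpump.ConfigConstants;      // assumed location
import com.marklogic.contentpump.MultithreadedMapper;

public class ThreadsPerSplitExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Override the default of 10 threads per split.
        conf.setInt(ConfigConstants.CONF_THREADS_PER_SPLIT, 4);
        Job job = Job.getInstance(conf, "multithreaded-import");
        System.out.println("threads per split = "
                + MultithreadedMapper.getNumberOfThreads(job));
    }
}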
From source file: com.marklogic.contentpump.MultithreadedMapper.java
License: Apache License
/**
 * Get the application's mapper class.
 *
 * @param <K1>
 *            the map's input key type
 * @param <V1>
 *            the map's input value type
 * @param <K2>
 *            the map's output key type
 * @param <V2>
 *            the map's output value type
 * @param job
 *            the job
 * @return the mapper class to run
 */
@SuppressWarnings("unchecked")
public static <K1, V1, K2, V2> Class<BaseMapper<K1, V1, K2, V2>> getMapperClass(JobContext job) {
    Configuration conf = job.getConfiguration();
    return (Class<BaseMapper<K1, V1, K2, V2>>) conf.getClass(ConfigConstants.CONF_MULTITHREADEDMAPPER_CLASS,
            BaseMapper.class);
}
From source file: com.marklogic.mapreduce.ForestInputFormat.java
License: Apache License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);
    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) { // stand directories
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job.getConfiguration());
        FileStatus[] children = fs.listStatus(path);
        FileStatus treeIndexStatus = null, treeDataStatus = null,
                ordinalsStatus = null, timestampsStatus = null;
        boolean obsolete = false;
        for (FileStatus child : children) {
            String fileName = child.getPath().getName();
            if (fileName.equals("TreeData")) { // inside a stand
                treeDataStatus = child;
            } else if (fileName.equals("TreeIndex")) {
                treeIndexStatus = child;
            } else if (fileName.equals("Ordinals")) {
                ordinalsStatus = child;
            } else if (fileName.equals("Timestamps")) {
                timestampsStatus = child;
            } else if (fileName.equals("Obsolete")) {
                obsolete = true;
                break;
            }
        }
        if (obsolete) {
            LOG.warn("Obsolete file found. The forest is either live or isn't "
                    + "dismounted cleanly. Ignoring stand " + path);
            break;
        }
        if (treeDataStatus == null) {
            throw new RuntimeException("TreeData file not found.");
        } else if (treeIndexStatus == null) {
            throw new RuntimeException("TreeIndex file not found.");
        } else if (ordinalsStatus == null) {
            throw new RuntimeException("Ordinals file not found.");
        } else if (timestampsStatus == null) {
            throw new RuntimeException("Timestamps file not found.");
        }
        long treeDataSize = treeDataStatus.getLen();
        if (treeDataSize == 0) {
            // unexpected, give up this stand
            LOG.warn("Found empty TreeData file. Skipping...");
            continue; // skipping this stand
        }
        Path treeDataPath = treeDataStatus.getPath();
        long blockSize = treeDataStatus.getBlockSize();
        long splitSize = computeSplitSize(blockSize, minSize, maxSize);
        // make splits based on TreeIndex
        FSDataInputStream is = fs.open(treeIndexStatus.getPath());
        BiendianDataInputStream in = new BiendianDataInputStream(is);
        int prevDocid = -1, docid = -1, position = 0;
        long prevOffset = -1L, offset = 0, splitStart = 0;
        BlockLocation[] blkLocations = fs.getFileBlockLocations(treeDataStatus, 0, treeDataSize);
        try {
            for (;; ++position) {
                try {
                    docid = in.readInt();
                    in.readInt();
                    offset = in.readLong();
                } catch (EOFException e) {
                    break;
                }
                int comp = InternalUtilities.compareUnsignedLong(offset, treeDataSize);
                if (comp > 0) {
                    throw new RuntimeException("TreeIndex offset is out of bound: position = "
                            + position + ", offset = " + offset
                            + ", treeDataSize = " + treeDataSize);
                }
                if (prevDocid != -1 && (docid & 0xffffffffL) <= (prevDocid & 0xffffffffL)) {
                    throw new RuntimeException("docid out of order, position = " + position
                            + ", docid = " + docid + ", prevDocid = " + prevDocid);
                }
                prevDocid = docid;
                if (prevOffset != -1L
                        && InternalUtilities.compareUnsignedLong(offset, prevOffset) <= 0) {
                    throw new RuntimeException("offset out of order, position = " + position
                            + ", offset = " + offset + ", prevOffset = " + prevOffset);
                }
                long splitLen = offset - splitStart;
                if (splitLen == splitSize || (splitLen > splitSize
                        && splitLen - splitSize <= splitSize - (prevOffset - splitStart))) {
                    int blkIndex = getBlockIndex(blkLocations, offset);
                    InputSplit split = new FileSplit(treeDataPath, splitStart, splitLen,
                            blkLocations[blkIndex].getHosts());
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Created split: start=" + splitStart + " len=" + splitLen
                                + " last docid=" + docid);
                    }
                    splits.add(split);
                    splitStart = offset;
                } else if (splitLen > splitSize) {
                    int blkIndex = getBlockIndex(blkLocations, prevOffset);
                    InputSplit split = new FileSplit(treeDataPath, splitStart,
                            prevOffset - splitStart, blkLocations[blkIndex].getHosts());
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Created split: start=" + splitStart + " len="
                                + (prevOffset - splitStart) + " last docid=" + docid);
                    }
                    splits.add(split);
                    splitStart = prevOffset;
                }
            }
        } finally {
            in.close();
        }
        if (offset > splitStart) {
            int blkIndex = getBlockIndex(blkLocations, offset - 1);
            InputSplit split = new FileSplit(treeDataPath, splitStart, offset - splitStart,
                    blkLocations[blkIndex].getHosts());
            if (LOG.isDebugEnabled()) {
                LOG.debug("Created split: start=" + splitStart + " len=" + (offset - splitStart)
                        + " last docid=" + docid);
            }
            splits.add(split);
        }
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("Made " + splits.size() + " splits.");
    }
    return splits;
}
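The target split size used above comes from FileInputFormat, which ForestInputFormat builds on. As a reference sketch, the standard Hadoop implementation is essentially the file's block size clamped between the configured minimum and maximum:

// Reference sketch of Hadoop FileInputFormat's split-size computation,
// which the getSplits() above relies on.
protected long computeSplitSize(long blockSize, long minSize, long maxSize) {
    return Math.max(minSize, Math.min(maxSize, blockSize));
}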
From source file: com.marklogic.mapreduce.MarkLogicInputFormat.java
License: Apache License
/**
 * Get input splits.
 * @param jobContext job context
 * @return list of input splits
 */
@SuppressWarnings("unchecked")
@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException, InterruptedException {
    // get input from job configuration
    jobConf = jobContext.getConfiguration();
    boolean advancedMode = jobConf.get(INPUT_MODE, BASIC_MODE).equals(ADVANCED_MODE);
    String splitQuery;
    String queryLanguage = null;
    String[] redactionRuleCol = jobConf.getStrings(REDACTION_RULE_COLLECTION);
    if (advancedMode) {
        queryLanguage = jobConf.get(INPUT_QUERY_LANGUAGE);
        splitQuery = jobConf.get(SPLIT_QUERY);
    } else {
        StringBuilder buf = new StringBuilder();
        buf.append("xquery version \"1.0-ml\";\n");
        buf.append("import module namespace hadoop = ");
        buf.append("\"http://marklogic.com/xdmp/hadoop\" at ");
        buf.append("\"/MarkLogic/hadoop.xqy\";\n");
        if (redactionRuleCol != null) {
            buf.append("import module namespace rdt = "
                    + "\"http://marklogic.com/xdmp/redaction\" at "
                    + "\"/MarkLogic/redaction.xqy\";\n");
        }
        buf.append("xdmp:host-name(xdmp:host()),\n");
        buf.append("hadoop:get-splits(\'");
        appendNsBindings(buf);
        buf.append("\', \'");
        appendDocumentSelector(buf);
        buf.append("\',");
        appendQuery(buf);
        buf.append("),\n");
        appendRedactionRuleValidateQuery(buf, redactionRuleCol);
        buf.append(",0,");
        appendCustom(buf);
        splitQuery = buf.toString();
    }
    String mode = jobConf.get(EXECUTION_MODE, MODE_DISTRIBUTED);
    long defaultSplitSize = mode.equals(MODE_DISTRIBUTED) ? DEFAULT_MAX_SPLIT_SIZE
            : DEFAULT_LOCAL_MAX_SPLIT_SIZE;
    long maxSplitSize = jobConf.getLong(MAX_SPLIT_SIZE, defaultSplitSize);
    if (maxSplitSize <= 0) {
        throw new IllegalStateException(
                "Max split size is required to be positive. It is set to " + maxSplitSize);
    }
    // fetch data from server
    List<ForestSplit> forestSplits = new ArrayList<ForestSplit>();
    Session session = null;
    ResultSequence result = null;
    if (LOG.isDebugEnabled()) {
        LOG.debug("Split query: " + splitQuery);
    }
    localMode = MODE_LOCAL.equals(jobConf.get(EXECUTION_MODE));
    try {
        ContentSource cs = InternalUtilities.getInputContentSource(jobConf);
        session = cs.newSession();
        RequestOptions options = new RequestOptions();
        options.setCacheResult(false);
        if (localMode && advancedMode) {
            AdhocQuery hostQuery = session
                    .newAdhocQuery("xquery version \"1.0-ml\";xdmp:host-name(xdmp:host())");
            hostQuery.setOptions(options);
            result = session.submitRequest(hostQuery);
            if (result.hasNext()) {
                ResultItem item = result.next();
                localHost = item.asString();
            }
            if (result != null) {
                result.close();
            }
        }
        AdhocQuery query = session.newAdhocQuery(splitQuery);
        if (queryLanguage != null) {
            InternalUtilities.checkQueryLanguage(queryLanguage);
            options.setQueryLanguage(queryLanguage);
        }
        query.setOptions(options);
        result = session.submitRequest(query);
        if (!advancedMode && result.hasNext()) {
            ResultItem item = result.next();
            localHost = item.asString();
        }
        List<String> ruleUris = null;
        if (redactionRuleCol != null) {
            ruleUris = new ArrayList<String>();
        }
        getForestSplits(jobContext, result, forestSplits, ruleUris);
        LOG.info("Fetched " + forestSplits.size() + " forest splits.");
    } catch (XccConfigException e) {
        LOG.error(e);
        throw new IOException(e);
    } catch (RequestException e) {
        LOG.error(e);
        LOG.error("Query: " + splitQuery);
        throw new IOException(e);
    } catch (URISyntaxException e) {
        LOG.error(e);
        throw new IOException(e);
    } finally {
        if (result != null) {
            result.close();
        }
        if (session != null) {
            session.close();
        }
    }
    // create a split list per forest per host
    if (forestSplits == null || forestSplits.isEmpty()) {
        return new ArrayList<InputSplit>();
    }
    // construct a list of splits per forest per host
    Map<String, List<List<InputSplit>>> hostForestSplits =
            new HashMap<String, List<List<InputSplit>>>();
    boolean tsQuery = (jobConf.get(INPUT_QUERY_TIMESTAMP) != null);
    for (int i = 0; i < forestSplits.size(); i++) {
        ForestSplit fsplit = forestSplits.get(i);
        List<InputSplit> splits = null;
        if (fsplit.recordCount > 0 || !tsQuery) {
            String host = fsplit.hostName;
            List<List<InputSplit>> splitLists = hostForestSplits.get(host);
            if (splitLists == null) {
                splitLists = new ArrayList<List<InputSplit>>();
                hostForestSplits.put(host, splitLists);
            }
            splits = new ArrayList<InputSplit>();
            splitLists.add(splits);
        } else {
            continue;
        }
        if (fsplit.recordCount < maxSplitSize) {
            MarkLogicInputSplit split = new MarkLogicInputSplit(0, fsplit.recordCount,
                    fsplit.forestId, fsplit.hostName);
            split.setLastSplit(true);
            splits.add(split);
            if (LOG.isDebugEnabled()) {
                LOG.debug("Added split " + split);
            }
        } else {
            long splitCount = fsplit.recordCount / maxSplitSize;
            long remainder = fsplit.recordCount % maxSplitSize;
            if (remainder != 0) {
                splitCount++;
            }
            long splitSize = fsplit.recordCount / splitCount;
            remainder = fsplit.recordCount % splitCount;
            if (remainder != 0) {
                splitSize++;
            }
            if (this instanceof KeyValueInputFormat<?, ?>) {
                // each split size has to be an even number
                if ((splitSize & 0x1) != 0) {
                    splitSize++;
                }
            }
            long remainingCount = fsplit.recordCount;
            while (remainingCount > 0) {
                long start = fsplit.recordCount - remainingCount;
                long length = splitSize;
                MarkLogicInputSplit split = new MarkLogicInputSplit(start, length,
                        fsplit.forestId, fsplit.hostName);
                if (remainingCount <= maxSplitSize) {
                    split.setLastSplit(true);
                }
                splits.add(split);
                remainingCount -= length;
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Added split " + split);
                }
            }
        }
    }
    // mix the lists of splits into one per host
    Set<String> hosts = hostForestSplits.keySet();
    int hostCount = hosts.size();
    List<InputSplit>[] hostSplits = (List<InputSplit>[]) new List<?>[hostCount];
    int i = 0;
    for (String host : hosts) {
        List<List<InputSplit>> splitLists = hostForestSplits.get(host);
        if (splitLists.size() == 1) {
            hostSplits[i++] = splitLists.get(0);
        } else {
            hostSplits[i] = new ArrayList<InputSplit>();
            boolean more = true;
            for (int j = 0; more; j++) {
                more = false;
                for (List<InputSplit> splitsPerForest : splitLists) {
                    if (j < splitsPerForest.size()) {
                        hostSplits[i].add(splitsPerForest.get(j));
                    }
                    more = more || (j + 1 < splitsPerForest.size());
                }
            }
            i++;
        }
    }
    // mix hostSplits into one
    List<InputSplit> splitList = new ArrayList<InputSplit>();
    boolean more = true;
    for (int j = 0; more; j++) {
        more = false;
        for (List<InputSplit> splitsPerHost : hostSplits) {
            if (j < splitsPerHost.size()) {
                splitList.add(splitsPerHost.get(j));
            }
            more = more || (j + 1 < splitsPerHost.size());
        }
    }
    LOG.info("Made " + splitList.size() + " splits.");
    if (LOG.isDebugEnabled()) {
        for (InputSplit split : splitList) {
            LOG.debug(split);
        }
    }
    return splitList;
}
From source file: com.metamx.druid.indexer.Utils.java
License: Open Source License
public static OutputStream makePathAndOutputStream(JobContext job, Path outputPath,
        boolean deleteExisting) throws IOException {
    OutputStream retVal;
    FileSystem fs = outputPath.getFileSystem(job.getConfiguration());
    if (fs.exists(outputPath)) {
        if (deleteExisting) {
            fs.delete(outputPath, false);
        } else {
            throw new ISE("outputPath[%s] must not exist.", outputPath);
        }
    }
    if (!FileOutputFormat.getCompressOutput(job)) {
        retVal = fs.create(outputPath, false);
    } else {
        Class<? extends CompressionCodec> codecClass =
                FileOutputFormat.getOutputCompressorClass(job, GzipCodec.class);
        CompressionCodec codec = ReflectionUtils.newInstance(codecClass, job.getConfiguration());
        outputPath = new Path(outputPath.toString() + codec.getDefaultExtension());
        retVal = codec.createOutputStream(fs.create(outputPath, false));
    }
    return retVal;
}
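A hypothetical caller sketch: whether makePathAndOutputStream() compresses and renames the output depends on the standard FileOutputFormat compression settings on the job. The path and job setup below are illustrative.

// Hypothetical usage: with compression enabled via the standard
// FileOutputFormat settings, makePathAndOutputStream() above appends the
// codec extension and wraps the stream in a compressing one.
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class CompressedOutputSetup {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance();
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
        // Utils.makePathAndOutputStream(job, new Path("/tmp/stats.json"), true)
        // would now write to /tmp/stats.json.gz through a gzip stream.
    }
}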
From source file: com.metamx.druid.indexer.Utils.java
License: Open Source License
public static InputStream openInputStream(JobContext job, Path inputPath) throws IOException {
    return openInputStream(inputPath, inputPath.getFileSystem(job.getConfiguration()));
}
From source file: com.metamx.druid.indexer.Utils.java
License: Open Source License
public static Map<String, Object> getStats(JobContext job, Path statsPath) throws IOException {
    FileSystem fs = statsPath.getFileSystem(job.getConfiguration());
    return jsonMapper.readValue(fs.open(statsPath), new TypeReference<Map<String, Object>>() {
    });
}
From source file: com.mongodb.hadoop.BSONFileInputFormat.java
License: Apache License
public static PathFilter getInputPathFilter(final JobContext context) {
    Configuration conf = context.getConfiguration();
    Class<?> filterClass = conf.getClass("bson.pathfilter.class", null, PathFilter.class);
    return filterClass != null ? (PathFilter) ReflectionUtils.newInstance(filterClass, conf) : null;
}
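A hypothetical setup sketch showing how the "bson.pathfilter.class" property read above could be populated. MyBsonFilter is an illustrative filter, not part of the MongoDB Hadoop connector.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class BsonFilterSetup {
    // Illustrative filter: only accept paths ending in ".bson".
    public static class MyBsonFilter implements PathFilter {
        @Override
        public boolean accept(Path path) {
            return path.getName().endsWith(".bson");
        }
    }

    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Register the filter so getInputPathFilter() will instantiate it
        // via ReflectionUtils (the filter needs a no-arg constructor).
        conf.setClass("bson.pathfilter.class", MyBsonFilter.class, PathFilter.class);
    }
}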