Example usage for org.apache.hadoop.mapreduce JobContext getConfiguration

Introduction

This page collects example usages of org.apache.hadoop.mapreduce.JobContext.getConfiguration().

Prototype

public Configuration getConfiguration();

Document

Return the configuration for the job.
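
Before the collected examples below, here is a minimal, hypothetical sketch of the typical call pattern. The class ExampleInputFormat and the property key "example.max.splits" are made up purely for illustration; only the Hadoop API calls themselves (InputFormat, JobContext.getConfiguration(), Configuration.getInt()) come from the real API.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

// Hypothetical InputFormat that only demonstrates reading a job property
// through JobContext.getConfiguration(); the key "example.max.splits" is
// made up for this sketch.
public class ExampleInputFormat extends InputFormat<LongWritable, Text> {

    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
        // The Configuration returned here carries all job-level settings.
        Configuration conf = context.getConfiguration();
        int maxSplits = conf.getInt("example.max.splits", 1);
        // A real implementation would build up to maxSplits InputSplit objects here.
        return new ArrayList<InputSplit>(maxSplits);
    }

    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        throw new UnsupportedOperationException("Not needed for this sketch.");
    }
}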

Usage

From source file: com.marklogic.contentpump.FileAndDirectoryInputFormat.java

License: Apache License

protected List<FileStatus> listStatus(JobContext job) throws IOException {
    Path[] dirs = getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    // get tokens for all the required FileSystems.
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration());

    // Whether we need to recursively look into the directory structure
    boolean recursive = getInputDirRecursive(job);

    // creates a MultiPathFilter with the hiddenFileFilter and the
    // user provided one (if any).
    List<PathFilter> filters = new ArrayList<PathFilter>();
    filters.add(hiddenFileFilter);
    PathFilter jobFilter = getInputPathFilter(job);
    if (jobFilter != null) {
        filters.add(jobFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(filters);

    List<FileStatus> result = simpleListStatus(job, dirs, inputFilter, recursive);

    LOG.info("Total input paths to process : " + result.size());
    return result;
}

From source file: com.marklogic.contentpump.FileAndDirectoryInputFormat.java

License: Apache License

private List<FileStatus> simpleListStatus(JobContext job, Path[] dirs, PathFilter inputFilter,
        boolean recursive) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();
    List<IOException> errors = new ArrayList<IOException>();
    Configuration conf = job.getConfiguration();
    for (int i = 0; i < dirs.length; ++i) {
        Path p = dirs[i];
        FileSystem fs = p.getFileSystem(conf);
        FileStatus[] matches = fs.globStatus(p, inputFilter);
        if (matches == null) {
            errors.add(new IOException("Input path does not exist: " + p));
        } else if (matches.length == 0) {
            errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
        } else {
            for (FileStatus globStat : matches) {
                if (globStat.isDirectory()) {
                    FileStatus[] files = fs.listStatus(globStat.getPath(), inputFilter);
                    for (int j = 0; j < files.length; j++) {
                        if (recursive && files[j].isDirectory()) {
                            simpleAddInputPathRecursively(result, fs, files[j].getPath(), inputFilter);
                        } else {
                            result.add(files[j]);
                        }
                    }
                } else {
                    result.add(globStat);
                }
            }
        }
    }

    if (!errors.isEmpty()) {
        throw new InvalidInputException(errors);
    }
    return result;
}

From source file: com.marklogic.contentpump.MultithreadedMapper.java

License: Apache License

/**
  * The number of threads in the thread pool that will run the map function.
  *
  * @param job
  *            the job
  * @return the number of threads
  */
public static int getNumberOfThreads(JobContext job) {
    return job.getConfiguration().getInt(ConfigConstants.CONF_THREADS_PER_SPLIT, 10);
}

From source file: com.marklogic.contentpump.MultithreadedMapper.java

License: Apache License

/**
 * Get the application's mapper class.
 * 
 * @param <K1>
 *            the map's input key type
 * @param <V1>
 *            the map's input value type
 * @param <K2>
 *            the map's output key type
 * @param <V2>
 *            the map's output value type
 * @param job
 *            the job
 * @return the mapper class to run
 */
@SuppressWarnings("unchecked")
public static <K1, V1, K2, V2> Class<BaseMapper<K1, V1, K2, V2>> getMapperClass(JobContext job) {
    Configuration conf = job.getConfiguration();
    return (Class<BaseMapper<K1, V1, K2, V2>>) conf.getClass(ConfigConstants.CONF_MULTITHREADEDMAPPER_CLASS,
            BaseMapper.class);
}

From source file: com.marklogic.mapreduce.ForestInputFormat.java

License: Apache License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) { // stand directories
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job.getConfiguration());
        FileStatus children[] = fs.listStatus(path);
        FileStatus treeIndexStatus = null, treeDataStatus = null, ordinalsStatus = null,
                timestampsStatus = null;
        boolean obsolete = false;
        for (FileStatus child : children) {
            String fileName = child.getPath().getName();
            if (fileName.equals("TreeData")) { // inside a stand
                treeDataStatus = child;
            } else if (fileName.equals("TreeIndex")) {
                treeIndexStatus = child;
            } else if (fileName.equals("Ordinals")) {
                ordinalsStatus = child;
            } else if (fileName.equals("Timestamps")) {
                timestampsStatus = child;
            } else if (fileName.equals("Obsolete")) {
                obsolete = true;
                break;
            }
        }
        if (obsolete) {
            LOG.warn("Obsolete file found.  The forest is either live or isn't "
                    + "dismounted cleanly.  Ignoring stand " + path);
            break;
        }
        if (treeDataStatus == null) {
            throw new RuntimeException("TreeData file not found.");
        } else if (treeIndexStatus == null) {
            throw new RuntimeException("TreeIndex file not found.");
        } else if (ordinalsStatus == null) {
            throw new RuntimeException("Ordinals file not found.");
        } else if (timestampsStatus == null) {
            throw new RuntimeException("Timestamps file not found.");
        }
        long treeDataSize = treeDataStatus.getLen();
        if (treeDataSize == 0) {
            // unexpected, give up this stand
            LOG.warn("Found empty TreeData file.  Skipping...");
            continue; // skipping this stand
        }
        Path treeDataPath = treeDataStatus.getPath();
        long blockSize = treeDataStatus.getBlockSize();
        long splitSize = computeSplitSize(blockSize, minSize, maxSize);
        // make splits based on TreeIndex
        FSDataInputStream is = fs.open(treeIndexStatus.getPath());
        BiendianDataInputStream in = new BiendianDataInputStream(is);
        int prevDocid = -1, docid = -1, position = 0;
        long prevOffset = -1L, offset = 0, splitStart = 0;
        BlockLocation[] blkLocations = fs.getFileBlockLocations(treeDataStatus, 0, treeDataSize);
        try {
            for (;; ++position) {
                try {
                    docid = in.readInt();
                    in.readInt();
                    offset = in.readLong();
                } catch (EOFException e) {
                    break;
                }
                int comp = InternalUtilities.compareUnsignedLong(offset, treeDataSize);
                if (comp > 0) {
                    throw new RuntimeException("TreeIndex offset is out of bound: position = " + position
                            + ", offset = " + offset + ", treeDataSize = " + treeDataSize);
                }
                if (prevDocid != -1 && (docid & 0xffffffffL) <= (prevDocid & 0xffffffffL)) {
                    throw new RuntimeException("docid out of order, position = " + position + ", docid = "
                            + docid + ", prevDocid = " + prevDocid);
                }
                prevDocid = docid;
                if (prevOffset != -1L && InternalUtilities.compareUnsignedLong(offset, prevOffset) <= 0) {
                    throw new RuntimeException("offset out of order, position = " + position + ", offset = "
                            + offset + ", prevOffset = " + prevOffset);
                }
                long splitLen = offset - splitStart;
                if (splitLen == splitSize || (splitLen > splitSize
                        && splitLen - splitSize <= splitSize - (prevOffset - splitStart))) {
                    int blkIndex = getBlockIndex(blkLocations, offset);
                    InputSplit split = new FileSplit(treeDataPath, splitStart, splitLen,
                            blkLocations[blkIndex].getHosts());
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Created split: start=" + splitStart + " len=" + splitLen + " last docid="
                                + docid);
                    }
                    splits.add(split);
                    splitStart = offset;
                } else if (splitLen > splitSize) {
                    int blkIndex = getBlockIndex(blkLocations, prevOffset);
                    InputSplit split = new FileSplit(treeDataPath, splitStart, prevOffset - splitStart,
                            blkLocations[blkIndex].getHosts());
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Created split: start=" + splitStart + " len=" + (prevOffset - splitStart)
                                + " last docid=" + docid);
                    }
                    splits.add(split);
                    splitStart = prevOffset;
                }
            }
        } finally {
            in.close();
        }
        if (offset > splitStart) {
            int blkIndex = getBlockIndex(blkLocations, offset - 1);
            InputSplit split = new FileSplit(treeDataPath, splitStart, offset - splitStart,
                    blkLocations[blkIndex].getHosts());
            if (LOG.isDebugEnabled()) {
                LOG.debug("Created split: start=" + splitStart + " len=" + (offset - splitStart)
                        + " last docid=" + docid);
            }

            splits.add(split);
        }
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("Made " + splits.size() + " splits.");
    }

    return splits;
}

From source file: com.marklogic.mapreduce.MarkLogicInputFormat.java

License: Apache License

/**
 * Get input splits.
 * @param jobContext job context
 * @return list of input splits    
 */
@SuppressWarnings("unchecked")
@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException, InterruptedException {
    // get input from job configuration
    jobConf = jobContext.getConfiguration();
    boolean advancedMode = jobConf.get(INPUT_MODE, BASIC_MODE).equals(ADVANCED_MODE);
    String splitQuery;
    String queryLanguage = null;
    String[] redactionRuleCol = jobConf.getStrings(REDACTION_RULE_COLLECTION);

    if (advancedMode) {
        queryLanguage = jobConf.get(INPUT_QUERY_LANGUAGE);
        splitQuery = jobConf.get(SPLIT_QUERY);
    } else {
        StringBuilder buf = new StringBuilder();
        buf.append("xquery version \"1.0-ml\";\n");
        buf.append("import module namespace hadoop = ");
        buf.append("\"http://marklogic.com/xdmp/hadoop\" at ");
        buf.append("\"/MarkLogic/hadoop.xqy\";\n");
        if (redactionRuleCol != null) {
            buf.append("import module namespace rdt = " + "\"http://marklogic.com/xdmp/redaction\" at "
                    + "\"/MarkLogic/redaction.xqy\";\n");
        }
        buf.append("xdmp:host-name(xdmp:host()),\n");
        buf.append("hadoop:get-splits(\'");
        appendNsBindings(buf);
        buf.append("\', \'");
        appendDocumentSelector(buf);
        buf.append("\',");
        appendQuery(buf);
        buf.append("),\n");
        appendRedactionRuleValidateQuery(buf, redactionRuleCol);
        buf.append(",0,");
        appendCustom(buf);
        splitQuery = buf.toString();
    }

    String mode = jobConf.get(EXECUTION_MODE, MODE_DISTRIBUTED);
    long defaultSplitSize = mode.equals(MODE_DISTRIBUTED) ? DEFAULT_MAX_SPLIT_SIZE
            : DEFAULT_LOCAL_MAX_SPLIT_SIZE;
    long maxSplitSize = jobConf.getLong(MAX_SPLIT_SIZE, defaultSplitSize);
    if (maxSplitSize <= 0) {
        throw new IllegalStateException(
                "Max split size is required to be positive. It is set to " + maxSplitSize);
    }

    // fetch data from server
    List<ForestSplit> forestSplits = new ArrayList<ForestSplit>();
    Session session = null;
    ResultSequence result = null;

    if (LOG.isDebugEnabled()) {
        LOG.debug("Split query: " + splitQuery);
    }
    localMode = MODE_LOCAL.equals(jobConf.get(EXECUTION_MODE));
    try {
        ContentSource cs = InternalUtilities.getInputContentSource(jobConf);
        session = cs.newSession();
        RequestOptions options = new RequestOptions();
        options.setCacheResult(false);

        if (localMode && advancedMode) {
            AdhocQuery hostQuery = session
                    .newAdhocQuery("xquery version \"1.0-ml\";xdmp:host-name(xdmp:host())");
            hostQuery.setOptions(options);
            result = session.submitRequest(hostQuery);
            if (result.hasNext()) {
                ResultItem item = result.next();
                localHost = item.asString();
            }
            if (result != null) {
                result.close();
            }
        }

        AdhocQuery query = session.newAdhocQuery(splitQuery);
        if (queryLanguage != null) {
            InternalUtilities.checkQueryLanguage(queryLanguage);
            options.setQueryLanguage(queryLanguage);
        }
        query.setOptions(options);
        result = session.submitRequest(query);

        if (!advancedMode && result.hasNext()) {
            ResultItem item = result.next();
            localHost = item.asString();
        }
        List<String> ruleUris = null;
        if (redactionRuleCol != null) {
            ruleUris = new ArrayList<String>();
        }
        getForestSplits(jobContext, result, forestSplits, ruleUris);
        LOG.info("Fetched " + forestSplits.size() + " forest splits.");
    } catch (XccConfigException e) {
        LOG.error(e);
        throw new IOException(e);
    } catch (RequestException e) {
        LOG.error(e);
        LOG.error("Query: " + splitQuery);
        throw new IOException(e);
    } catch (URISyntaxException e) {
        LOG.error(e);
        throw new IOException(e);
    } finally {
        if (result != null) {
            result.close();
        }
        if (session != null) {
            session.close();
        }
    }

    // return an empty split list if no forest splits were fetched
    if (forestSplits == null || forestSplits.isEmpty()) {
        return new ArrayList<InputSplit>();
    }

    // construct a list of splits per forest per host
    Map<String, List<List<InputSplit>>> hostForestSplits = new HashMap<String, List<List<InputSplit>>>();
    boolean tsQuery = (jobConf.get(INPUT_QUERY_TIMESTAMP) != null);
    for (int i = 0; i < forestSplits.size(); i++) {
        ForestSplit fsplit = forestSplits.get(i);
        List<InputSplit> splits = null;
        if (fsplit.recordCount > 0 || !tsQuery) {
            String host = fsplit.hostName;
            List<List<InputSplit>> splitLists = hostForestSplits.get(host);
            if (splitLists == null) {
                splitLists = new ArrayList<List<InputSplit>>();
                hostForestSplits.put(host, splitLists);
            }
            splits = new ArrayList<InputSplit>();
            splitLists.add(splits);
        } else {
            continue;
        }
        if (fsplit.recordCount < maxSplitSize) {
            MarkLogicInputSplit split = new MarkLogicInputSplit(0, fsplit.recordCount, fsplit.forestId,
                    fsplit.hostName);
            split.setLastSplit(true);
            splits.add(split);
            if (LOG.isDebugEnabled()) {
                LOG.debug("Added split " + split);
            }
        } else {
            long splitCount = fsplit.recordCount / maxSplitSize;
            long remainder = fsplit.recordCount % maxSplitSize;
            if (remainder != 0) {
                splitCount++;
            }
            long splitSize = fsplit.recordCount / splitCount;
            remainder = fsplit.recordCount % splitCount;
            if (remainder != 0) {
                splitSize++;
            }
            if (this instanceof KeyValueInputFormat<?, ?>) {
                // each split size has to be an even number
                if ((splitSize & 0x1) != 0) {
                    splitSize++;
                }
            }
            long remainingCount = fsplit.recordCount;
            while (remainingCount > 0) {
                long start = fsplit.recordCount - remainingCount;
                long length = splitSize;
                MarkLogicInputSplit split = new MarkLogicInputSplit(start, length, fsplit.forestId,
                        fsplit.hostName);
                if (remainingCount <= maxSplitSize) {
                    split.setLastSplit(true);
                }
                splits.add(split);
                remainingCount -= length;
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Added split " + split);
                }
            }
        }
    }

    // mix the lists of splits into one per host
    Set<String> hosts = hostForestSplits.keySet();
    int hostCount = hosts.size();
    List<InputSplit>[] hostSplits = (List<InputSplit>[]) new List<?>[hostCount];
    int i = 0;
    for (String host : hosts) {
        List<List<InputSplit>> splitLists = hostForestSplits.get(host);
        if (splitLists.size() == 1) {
            hostSplits[i++] = splitLists.get(0);
        } else {
            hostSplits[i] = new ArrayList<InputSplit>();
            boolean more = true;
            for (int j = 0; more; j++) {
                more = false;
                for (List<InputSplit> splitsPerForest : splitLists) {
                    if (j < splitsPerForest.size()) {
                        hostSplits[i].add(splitsPerForest.get(j));
                    }
                    more = more || (j + 1 < splitsPerForest.size());
                }
            }
            i++;
        }
    }

    // mix hostSplits into one
    List<InputSplit> splitList = new ArrayList<InputSplit>();
    boolean more = true;
    for (int j = 0; more; j++) {
        more = false;
        for (List<InputSplit> splitsPerHost : hostSplits) {
            if (j < splitsPerHost.size()) {
                splitList.add(splitsPerHost.get(j));
            }
            more = more || (j + 1 < splitsPerHost.size());
        }
    }

    LOG.info("Made " + splitList.size() + " splits.");
    if (LOG.isDebugEnabled()) {
        for (InputSplit split : splitList) {
            LOG.debug(split);
        }
    }
    return splitList;
}

From source file: com.metamx.druid.indexer.Utils.java

License: Open Source License

public static OutputStream makePathAndOutputStream(JobContext job, Path outputPath, boolean deleteExisting)
        throws IOException {
    OutputStream retVal;
    FileSystem fs = outputPath.getFileSystem(job.getConfiguration());

    if (fs.exists(outputPath)) {
        if (deleteExisting) {
            fs.delete(outputPath, false);
        } else {
            throw new ISE("outputPath[%s] must not exist.", outputPath);
        }
    }

    if (!FileOutputFormat.getCompressOutput(job)) {
        retVal = fs.create(outputPath, false);
    } else {
        Class<? extends CompressionCodec> codecClass = FileOutputFormat.getOutputCompressorClass(job,
                GzipCodec.class);
        CompressionCodec codec = ReflectionUtils.newInstance(codecClass, job.getConfiguration());
        outputPath = new Path(outputPath.toString() + codec.getDefaultExtension());

        retVal = codec.createOutputStream(fs.create(outputPath, false));
    }

    return retVal;
}

From source file: com.metamx.druid.indexer.Utils.java

License: Open Source License

public static InputStream openInputStream(JobContext job, Path inputPath) throws IOException {
    return openInputStream(inputPath, inputPath.getFileSystem(job.getConfiguration()));
}

From source file: com.metamx.druid.indexer.Utils.java

License: Open Source License

public static Map<String, Object> getStats(JobContext job, Path statsPath) throws IOException {
    FileSystem fs = statsPath.getFileSystem(job.getConfiguration());

    return jsonMapper.readValue(fs.open(statsPath), new TypeReference<Map<String, Object>>() {
    });
}

From source file: com.mongodb.hadoop.BSONFileInputFormat.java

License: Apache License

public static PathFilter getInputPathFilter(final JobContext context) {
    Configuration conf = context.getConfiguration();
    Class<?> filterClass = conf.getClass("bson.pathfilter.class", null, PathFilter.class);
    return filterClass != null ? (PathFilter) ReflectionUtils.newInstance(filterClass, conf) : null;
}