Example usage for org.apache.hadoop.mapreduce JobContext getConfiguration

Introduction

On this page you can find example usage for org.apache.hadoop.mapreduce JobContext getConfiguration.

Prototype

public Configuration getConfiguration();

Document

Return the configuration for the job.
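
The sketch below is not taken from the examples that follow; it is a minimal illustration of the common pattern in which a custom InputFormat calls JobContext#getConfiguration() to read job properties that drive split generation. The class name ExampleInputFormat and the property example.min.record.length are hypothetical.

// Minimal sketch; the property name "example.min.record.length" is hypothetical.
import java.io.IOException;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class ExampleInputFormat extends TextInputFormat {
    @Override
    public List<InputSplit> getSplits(JobContext job) throws IOException {
        // getConfiguration() returns the job's Configuration, which holds both
        // Hadoop defaults and any properties set by the driver.
        Configuration conf = job.getConfiguration();
        long minRecordLength = conf.getLong("example.min.record.length", 1L);
        // The value could then be used to filter or size splits; this sketch
        // simply delegates to the parent implementation.
        return super.getSplits(job);
    }
}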

Usage

From source file:com.thinkbiganalytics.kylo.catalog.spark.sources.spark.HighWaterMarkInputFormat.java

License:Apache License

/**
 * Gets the high water mark value.
 */
public static long getHighWaterMark(@Nonnull final JobContext job) {
    return job.getConfiguration().getLong(HIGH_WATER_MARK, Long.MIN_VALUE);
}

From source file:com.thinkbiganalytics.kylo.catalog.spark.sources.spark.HighWaterMarkInputFormat.java

License:Apache License

/**
 * Gets the maximum file age.
 */
public static long getMaxFileAge(@Nonnull final JobContext job) {
    return job.getConfiguration().getLong(MAX_FILE_AGE, Long.MAX_VALUE);
}

From source file:com.thinkbiganalytics.kylo.catalog.spark.sources.spark.HighWaterMarkInputFormat.java

License:Apache License

/**
 * Gets the minimum file age.
 */
public static long getMinFileAge(@Nonnull final JobContext job) {
    return job.getConfiguration().getLong(MIN_FILE_AGE, Long.MIN_VALUE);
}

From source file:com.topsoft.botspider.avro.mapreduce.output.ExtFileOutputFormat.java

License:Apache License

/**
 * Set the base output name for the output file to be created.
 */
public static void setOutputName(JobContext job, String name) {
    job.getConfiguration().set(BASE_OUTPUT_NAME, name);
}
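
As a usage note (a hedged sketch, not part of the original source): org.apache.hadoop.mapreduce.Job is a JobContext, so a driver can pass its Job instance directly to setOutputName. The job name "example" and base output name "part" are placeholders.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

import com.topsoft.botspider.avro.mapreduce.output.ExtFileOutputFormat;

public class SetOutputNameExample {
    public static void main(String[] args) throws Exception {
        // A Job is a JobContext, so it can be handed to setOutputName directly.
        Job job = Job.getInstance(new Configuration(), "example");
        ExtFileOutputFormat.setOutputName(job, "part"); // "part" is a placeholder base name
    }
}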

From source file:com.toshiba.mwcloud.gs.hadoop.mapreduce.GSRowInputFormat.java

License:Apache License

/**
 * Generate a list of GridDB InputSplit objects.<br/>
 * The number of InputSplits will be the smaller of the number of partitions for input processing and the value of the mapreduce.job.maps property.
 * @param context JobContext object
 * @throws GSException an exception occurred in GridDB
 */
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {

    Configuration conf = context.getConfiguration();
    int numSplits = conf.getInt("mapreduce.job.maps", 1);
    GDInputFormat inputFormat = new GDInputFormat();
    List<InputSplit> splits = inputFormat.getSplitList(numSplits, conf);
    return splits;
}

From source file:com.transwarp.hbase.bulkload.combine.remote.CombineRemoteFileInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {

    long minSizeNode = 0;
    long minSizeRack = 0;
    long maxSize = 0;
    Configuration conf = job.getConfiguration();

    // the values specified by setxxxSplitSize() take precedence over the
    // values that might have been specified in the config
    if (minSplitSizeNode != 0) {
        minSizeNode = minSplitSizeNode;
    } else {
        minSizeNode = conf.getLong("mapred.min.split.size.per.node", 0);
    }
    if (minSplitSizeRack != 0) {
        minSizeRack = minSplitSizeRack;
    } else {
        minSizeRack = conf.getLong("mapred.min.split.size.per.rack", 0);
    }
    if (maxSplitSize != 0) {
        maxSize = maxSplitSize;
    } else {
        maxSize = conf.getLong("mapred.max.split.size", 0);
    }
    if (minSizeNode != 0 && maxSize != 0 && minSizeNode > maxSize) {
        throw new IOException("Minimum split size per node " + minSizeNode
                + " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && maxSize != 0 && minSizeRack > maxSize) {
        throw new IOException("Minimum split size per rack " + minSizeRack
                + " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && minSizeNode > minSizeRack) {
        throw new IOException("Minimum split size per node " + minSizeNode
                + " cannot be smaller than minimum split " + "size per rack " + minSizeRack);
    }

    // all the files in input set
    Path[] paths = FileUtil.stat2Paths(listStatus(job).toArray(new FileStatus[0]));
    List<InputSplit> splits = new ArrayList<InputSplit>();
    if (paths.length == 0) {
        return splits;
    }

    // Convert them to Paths first. This is a costly operation and
    // we should do it up front; otherwise we would incur it multiple
    // times, once for each pool in the next loop.
    List<Path> newpaths = new LinkedList<Path>();
    for (int i = 0; i < paths.length; i++) {
        //Path p = new Path(paths[i].toUri().getPath());
        Path p = paths[i];
        newpaths.add(p);
    }
    paths = null;

    // In one single iteration, process all the paths in a single pool.
    // Processing one pool at a time ensures that a split contains paths
    // from a single pool only.
    for (MultiPathFilter onepool : pools) {
        ArrayList<Path> myPaths = new ArrayList<Path>();

        // pick one input path. If it matches all the filters in a pool,
        // add it to the output set
        for (Iterator<Path> iter = newpaths.iterator(); iter.hasNext();) {
            Path p = iter.next();
            if (onepool.accept(p)) {
                myPaths.add(p); // add it to my output set
                iter.remove();
            }
        }
        // create splits for all files in this pool.
        getMoreSplits(job, myPaths.toArray(new Path[myPaths.size()]), maxSize, minSizeNode, minSizeRack,
                splits);
    }

    // create splits for all files that are not in any pool.
    getMoreSplits(job, newpaths.toArray(new Path[newpaths.size()]), maxSize, minSizeNode, minSizeRack, splits);

    // free up rackToNodes map
    rackToNodes.clear();
    return splits;
}

From source file:com.transwarp.hbase.bulkload.combine.remote.CombineRemoteFileInputFormat.java

License:Apache License

/**
 * Return all the splits in the specified set of paths.
 */
private void getMoreSplits(JobContext job, Path[] paths, long maxSize, long minSizeNode, long minSizeRack,
        List<InputSplit> splits) throws IOException {
    Configuration conf = job.getConfiguration();

    // all blocks for all the files in input set
    OneFileInfo[] files;

    // mapping from a rack name to the list of blocks it has
    HashMap<String, List<OneBlockInfo>> rackToBlocks = new HashMap<String, List<OneBlockInfo>>();

    // mapping from a block to the nodes on which it has replicas
    HashMap<OneBlockInfo, String[]> blockToNodes = new HashMap<OneBlockInfo, String[]>();

    // mapping from a node to the list of blocks that it contains
    HashMap<String, List<OneBlockInfo>> nodeToBlocks = new HashMap<String, List<OneBlockInfo>>();

    files = new OneFileInfo[paths.length];
    if (paths.length == 0) {
        return;
    }

    // populate all the blocks for all files
    long totLength = 0;
    for (int i = 0; i < paths.length; i++) {
        files[i] = new OneFileInfo(paths[i], conf, isSplitable(job, paths[i]), rackToBlocks, blockToNodes,
                nodeToBlocks, rackToNodes, maxSize);
        totLength += files[i].getLength();
    }

    ArrayList<OneBlockInfo> validBlocks = new ArrayList<OneBlockInfo>();
    Set<String> nodes = new HashSet<String>();
    long curSplitSize = 0;

    // process all nodes and create splits that are local
    // to a node. 
    for (Iterator<Map.Entry<String, List<OneBlockInfo>>> iter = nodeToBlocks.entrySet().iterator(); iter
            .hasNext();) {

        Map.Entry<String, List<OneBlockInfo>> one = iter.next();
        nodes.add(one.getKey());
        List<OneBlockInfo> blocksInNode = one.getValue();

        // for each block, copy it into validBlocks. Delete it from 
        // blockToNodes so that the same block does not appear in 
        // two different splits.
        for (OneBlockInfo oneblock : blocksInNode) {
            if (blockToNodes.containsKey(oneblock)) {
                validBlocks.add(oneblock);
                blockToNodes.remove(oneblock);
                curSplitSize += oneblock.length;

                // if the accumulated split size exceeds the maximum, then 
                // create this split.
                if (maxSize != 0 && curSplitSize >= maxSize) {
                    // create an input split and add it to the splits array
                    addCreatedSplit(splits, nodes, validBlocks);
                    curSplitSize = 0;
                    validBlocks.clear();
                }
            }
        }
        // if there were any blocks left over and their combined size is
        // larger than minSplitNode, then combine them into one split.
        // Otherwise add them back to the unprocessed pool. It is likely 
        // that they will be combined with other blocks from the 
        // same rack later on.
        if (minSizeNode != 0 && curSplitSize >= minSizeNode) {
            // create an input split and add it to the splits array
            addCreatedSplit(splits, nodes, validBlocks);
        } else {
            for (OneBlockInfo oneblock : validBlocks) {
                blockToNodes.put(oneblock, oneblock.hosts);
            }
        }
        validBlocks.clear();
        nodes.clear();
        curSplitSize = 0;
    }

    // if blocks in a rack are below the specified minimum size, then keep them
    // in 'overflow'. After the processing of all racks is complete, these 
    // overflow blocks will be combined into splits.
    ArrayList<OneBlockInfo> overflowBlocks = new ArrayList<OneBlockInfo>();
    Set<String> racks = new HashSet<String>();

    // Process all racks over and over again until there is no more work to do.
    while (blockToNodes.size() > 0) {

        // Create one split for this rack before moving over to the next rack. 
        // Come back to this rack after creating a single split for each of the 
        // remaining racks.
        // Process one rack location at a time, combining all possible blocks
        // that reside on this rack into one split (constrained by the minimum
        // and maximum split sizes).

        // iterate over all racks 
        for (Iterator<Map.Entry<String, List<OneBlockInfo>>> iter = rackToBlocks.entrySet().iterator(); iter
                .hasNext();) {

            Map.Entry<String, List<OneBlockInfo>> one = iter.next();
            racks.add(one.getKey());
            List<OneBlockInfo> blocks = one.getValue();

            // for each block, copy it into validBlocks. Delete it from 
            // blockToNodes so that the same block does not appear in 
            // two different splits.
            boolean createdSplit = false;
            for (OneBlockInfo oneblock : blocks) {
                if (blockToNodes.containsKey(oneblock)) {
                    validBlocks.add(oneblock);
                    blockToNodes.remove(oneblock);
                    curSplitSize += oneblock.length;

                    // if the accumulated split size exceeds the maximum, then 
                    // create this split.
                    if (maxSize != 0 && curSplitSize >= maxSize) {
                        // create an input split and add it to the splits array
                        addCreatedSplit(splits, getHosts(racks), validBlocks);
                        createdSplit = true;
                        break;
                    }
                }
            }

            // if we created a split, then just go to the next rack
            if (createdSplit) {
                curSplitSize = 0;
                validBlocks.clear();
                racks.clear();
                continue;
            }

            if (!validBlocks.isEmpty()) {
                if (minSizeRack != 0 && curSplitSize >= minSizeRack) {
                    // if there is a minimum size specified, then create a single split
                    // otherwise, store these blocks into overflow data structure
                    addCreatedSplit(splits, getHosts(racks), validBlocks);
                } else {
                    // There were a few blocks in this rack that 
                    // remained to be processed. Keep them in 'overflow' block list. 
                    // These will be combined later.
                    overflowBlocks.addAll(validBlocks);
                }
            }
            curSplitSize = 0;
            validBlocks.clear();
            racks.clear();
        }
    }

    assert blockToNodes.isEmpty();
    assert curSplitSize == 0;
    assert validBlocks.isEmpty();
    assert racks.isEmpty();

    // Process all overflow blocks
    for (OneBlockInfo oneblock : overflowBlocks) {
        validBlocks.add(oneblock);
        curSplitSize += oneblock.length;

        // This might cause an existing rack location to be re-added,
        // but it should be ok.
        for (int i = 0; i < oneblock.racks.length; i++) {
            racks.add(oneblock.racks[i]);
        }

        // if the accumulated split size exceeds the maximum, then 
        // create this split.
        if (maxSize != 0 && curSplitSize >= maxSize) {
            // create an input split and add it to the splits array
            addCreatedSplit(splits, getHosts(racks), validBlocks);
            curSplitSize = 0;
            validBlocks.clear();
            racks.clear();
        }
    }

    // Process any remaining blocks.
    if (!validBlocks.isEmpty()) {
        addCreatedSplit(splits, getHosts(racks), validBlocks);
    }
}

From source file:com.twitter.algebra.matrix.format.MatrixOutputFormat.java

License:Apache License

@Override
public void checkOutputSpecs(JobContext context) throws IOException, InterruptedException {
    if (baseOut == null) {
        getBaseOutputFormat(context.getConfiguration());
    }
    super.checkOutputSpecs(context);
}

From source file:com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java

License:Apache License

/**
 * Go through each original input split, get its file path, and check the
 * index file:
 * a) keep it when there is no index prebuilt on this file
 *    (or the index file doesn't match the base file's checksum);
 * b) remove it when no matching value is found in the existing index file;
 * c) construct new, smaller input splits using the indexed blocks found
 *    in the index file.
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {

    String inputformat = job.getConfiguration().get(REALINPUTFORMAT);
    String valueClass = job.getConfiguration().get(VALUECLASS);

    List<InputSplit> filteredList = new ArrayList<InputSplit>();

    FileInputFormat<K, V> realInputFormat = getInputFormatClass(inputformat, valueClass);

    List<InputSplit> splits = realInputFormat.getSplits(job);

    // for indexing jobs, don't skip any input splits;
    // for a search job with no search filter, skip the index as well.
    if (isIndexingJob(job) || getFilterCondition(job) == null)
        return splits;

    Path prevFile = null; // remember the last input file we saw
    boolean foundIndexedFile = false; // is there an index file for prevFile?
    boolean firstTime = true; // is this the first time we see this file?

    long totalOriginalBytes = 0; //the bytes to be scanned without indexes.
    totalBytesNewSplits = 0;
    long startTime = System.currentTimeMillis();
    LOG.info("start filtering out original input splits (total " + splits.size() + ") using indexes");
    Configuration conf = job.getConfiguration();
    long splitMaxSize;

    // for each original input split check if we can filter it out.
    for (InputSplit split : splits) {
        FileSplit fileSplit = (FileSplit) split;
        Path path = fileSplit.getPath();
        splitLength = fileSplit.getLength();
        totalOriginalBytes += fileSplit.getLength();
        splitMaxSize = Math.max(splitLength,
                conf.getInt(INDEXED_SPLIT_SIZE, conf.getInt("dfs.block.size", 256 * 1024 * 1024)));

        /*
         * for each new file we see, we first check if it has been indexed or not;
         * if not, we just add the original input split; if yes, we use the index
         * file to add filtered splits for the file
         */
        if (prevFile != null && path.equals(prevFile)) {
            firstTime = false;
        } else {
            prevFile = path;
            firstTime = true;
            foundIndexedFile = foundIndexFile(job, path);
        }

        // if no index file, we'll have to read all original input
        // splits
        if (!foundIndexedFile)
            filteredList.add(fileSplit);
        else {
            // for each file we only add once its filtered input splits using index
            // file
            if (firstTime) {
                // LOG.info("first time saw " + path
                // + ", adding filtered splits from index file");
                filteredList.addAll(getFilteredSplits(job, path, fileSplit.getLocations(), splitMaxSize));
            }
        }
    }

    long endTime = System.currentTimeMillis();
    LOG.info("finished filtering out input splits, now total splits:" + filteredList.size() + ", seconds used: "
            + (endTime - startTime) / 1000);
    LOG.info(String.format("total bytes to read before filtering: %s," + " after filtering %s, bytes ratio: %s",
            totalOriginalBytes, totalBytesNewSplits, totalOriginalBytes / Math.max(1, totalBytesNewSplits)));
    return filteredList;
}

From source file:com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java

License:Apache License

/**
 * @param context the job context
 * @return the index directory provided to the job
 */
public static String getIndexDir(JobContext context) {
    return context.getConfiguration().get(INDEXDIR);
}