Example usage for org.apache.hadoop.mapreduce JobContext getConfiguration

Introduction

On this page you can find example usage for org.apache.hadoop.mapreduce JobContext getConfiguration.

Prototype

public Configuration getConfiguration();

Document

Return the configuration for the job.
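
The sketch below is not taken from the examples that follow; it is a minimal illustration of the common pattern in which a custom InputFormat calls JobContext#getConfiguration() to read job properties that drive split generation. The class name ExampleInputFormat and the property example.min.record.length are hypothetical.

// Minimal sketch; the property name "example.min.record.length" is hypothetical.
import java.io.IOException;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class ExampleInputFormat extends TextInputFormat {
    @Override
    public List<InputSplit> getSplits(JobContext job) throws IOException {
        // getConfiguration() returns the job's Configuration, which holds both
        // Hadoop defaults and any properties set by the driver.
        Configuration conf = job.getConfiguration();
        long minRecordLength = conf.getLong("example.min.record.length", 1L);
        // The value could then be used to filter or size splits; this sketch
        // simply delegates to the parent implementation.
        return super.getSplits(job);
    }
}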

Usage

From source file:com.thinkbiganalytics.kylo.catalog.spark.sources.spark.HighWaterMarkInputFormat.java

License:Apache License

/**
 * Gets the high water mark value.
 */
public static long getHighWaterMark(@Nonnull final JobContext job) {
    return job.getConfiguration().getLong(HIGH_WATER_MARK, Long.MIN_VALUE);
}

From source file:com.thinkbiganalytics.kylo.catalog.spark.sources.spark.HighWaterMarkInputFormat.java

License:Apache License

/**
 * Gets the maximum file age.
 */
public static long getMaxFileAge(@Nonnull final JobContext job) {
    return job.getConfiguration().getLong(MAX_FILE_AGE, Long.MAX_VALUE);
}

From source file:com.thinkbiganalytics.kylo.catalog.spark.sources.spark.HighWaterMarkInputFormat.java

License:Apache License

/**
 * Gets the minimum file age.
 */
public static long getMinFileAge(@Nonnull final JobContext job) {
    return job.getConfiguration().getLong(MIN_FILE_AGE, Long.MIN_VALUE);
}

From source file:com.topsoft.botspider.avro.mapreduce.output.ExtFileOutputFormat.java

License:Apache License

/**
 * Set the base output name for the output file to be created.
 */
public static void setOutputName(JobContext job, String name) {
    job.getConfiguration().set(BASE_OUTPUT_NAME, name);
}
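
As a usage note (a hedged sketch, not part of the original source): org.apache.hadoop.mapreduce.Job is a JobContext, so a driver can pass its Job instance directly to setOutputName. The job name "example" and base output name "part" are placeholders.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

import com.topsoft.botspider.avro.mapreduce.output.ExtFileOutputFormat;

public class SetOutputNameExample {
    public static void main(String[] args) throws Exception {
        // A Job is a JobContext, so it can be handed to setOutputName directly.
        Job job = Job.getInstance(new Configuration(), "example");
        ExtFileOutputFormat.setOutputName(job, "part"); // "part" is a placeholder base name
    }
}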

From source file:com.toshiba.mwcloud.gs.hadoop.mapreduce.GSRowInputFormat.java

License:Apache License

/**
 * Generate a list of GridDB InputSplit objects.<br/>
 * The number of InputSplits will be the smaller of the number of partitions for input processing and the value of the mapreduce.job.maps property.
 * @param context JobContext object
 * @throws GSException an exception occurred in GridDB
 */
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {

    Configuration conf = context.getConfiguration();
    int numSplits = conf.getInt("mapreduce.job.maps", 1);
    GDInputFormat inputFormat = new GDInputFormat();
    List<InputSplit> splits = inputFormat.getSplitList(numSplits, conf);
    return splits;
}

From source file:com.transwarp.hbase.bulkload.combine.remote.CombineRemoteFileInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {

    long minSizeNode = 0;
    long minSizeRack = 0;
    long maxSize = 0;
    Configuration conf = job.getConfiguration();

    // the values specified by setxxxSplitSize() take precedence over the
    // values that might have been specified in the config
    if (minSplitSizeNode != 0) {
        minSizeNode = minSplitSizeNode;
    } else {
        minSizeNode = conf.getLong("mapred.min.split.size.per.node", 0);
    }
    if (minSplitSizeRack != 0) {
        minSizeRack = minSplitSizeRack;
    } else {
        minSizeRack = conf.getLong("mapred.min.split.size.per.rack", 0);
    }
    if (maxSplitSize != 0) {
        maxSize = maxSplitSize;
    } else {
        maxSize = conf.getLong("mapred.max.split.size", 0);
    }
    if (minSizeNode != 0 && maxSize != 0 && minSizeNode > maxSize) {
        throw new IOException("Minimum split size per node " + minSizeNode
                + " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && maxSize != 0 && minSizeRack > maxSize) {
        throw new IOException("Minimum split size per rack " + minSizeRack
                + " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && minSizeNode > minSizeRack) {
        throw new IOException("Minimum split size per node " + minSizeNode
                + " cannot be smaller than minimum split " + "size per rack " + minSizeRack);
    }

    // all the files in input set
    Path[] paths = FileUtil.stat2Paths(listStatus(job).toArray(new FileStatus[0]));
    List<InputSplit> splits = new ArrayList<InputSplit>();
    if (paths.length == 0) {
        return splits;
    }

    // Convert them to Paths first. This is a costly operation and
    // we should do it up front; otherwise we would incur it multiple
    // times, once for each pool in the next loop.
    List<Path> newpaths = new LinkedList<Path>();
    for (int i = 0; i < paths.length; i++) {
        //Path p = new Path(paths[i].toUri().getPath());
        Path p = paths[i];
        newpaths.add(p);
    }
    paths = null;

    // In one single iteration, process all the paths in a single pool.
    // Processing one pool at a time ensures that a split contains paths
    // from a single pool only.
    for (MultiPathFilter onepool : pools) {
        ArrayList<Path> myPaths = new ArrayList<Path>();

        // pick one input path. If it matches all the filters in a pool,
        // add it to the output set
        for (Iterator<Path> iter = newpaths.iterator(); iter.hasNext();) {
            Path p = iter.next();
            if (onepool.accept(p)) {
                myPaths.add(p); // add it to my output set
                iter.remove();
            }
        }
        // create splits for all files in this pool.
        getMoreSplits(job, myPaths.toArray(new Path[myPaths.size()]), maxSize, minSizeNode, minSizeRack,
                splits);
    }

    // create splits for all files that are not in any pool.
    getMoreSplits(job, newpaths.toArray(new Path[newpaths.size()]), maxSize, minSizeNode, minSizeRack, splits);

    // free up rackToNodes map
    rackToNodes.clear();
    return splits;
}

From source file:com.transwarp.hbase.bulkload.combine.remote.CombineRemoteFileInputFormat.java

License:Apache License

/**
 * Return all the splits in the specified set of paths.
 */
private void getMoreSplits(JobContext job, Path[] paths, long maxSize, long minSizeNode, long minSizeRack,
        List<InputSplit> splits) throws IOException {
    Configuration conf = job.getConfiguration();

    // all blocks for all the files in input set
    OneFileInfo[] files;

    // mapping from a rack name to the list of blocks it has
    HashMap<String, List<OneBlockInfo>> rackToBlocks = new HashMap<String, List<OneBlockInfo>>();

    // mapping from a block to the nodes on which it has replicas
    HashMap<OneBlockInfo, String[]> blockToNodes = new HashMap<OneBlockInfo, String[]>();

    // mapping from a node to the list of blocks that it contains
    HashMap<String, List<OneBlockInfo>> nodeToBlocks = new HashMap<String, List<OneBlockInfo>>();

    files = new OneFileInfo[paths.length];
    if (paths.length == 0) {
        return;
    }

    // populate all the blocks for all files
    long totLength = 0;
    for (int i = 0; i < paths.length; i++) {
        files[i] = new OneFileInfo(paths[i], conf, isSplitable(job, paths[i]), rackToBlocks, blockToNodes,
                nodeToBlocks, rackToNodes, maxSize);
        totLength += files[i].getLength();
    }

    ArrayList<OneBlockInfo> validBlocks = new ArrayList<OneBlockInfo>();
    Set<String> nodes = new HashSet<String>();
    long curSplitSize = 0;

    // process all nodes and create splits that are local
    // to a node. 
    for (Iterator<Map.Entry<String, List<OneBlockInfo>>> iter = nodeToBlocks.entrySet().iterator(); iter
            .hasNext();) {

        Map.Entry<String, List<OneBlockInfo>> one = iter.next();
        nodes.add(one.getKey());
        List<OneBlockInfo> blocksInNode = one.getValue();

        // for each block, copy it into validBlocks. Delete it from 
        // blockToNodes so that the same block does not appear in 
        // two different splits.
        for (OneBlockInfo oneblock : blocksInNode) {
            if (blockToNodes.containsKey(oneblock)) {
                validBlocks.add(oneblock);
                blockToNodes.remove(oneblock);
                curSplitSize += oneblock.length;

                // if the accumulated split size exceeds the maximum, then 
                // create this split.
                if (maxSize != 0 && curSplitSize >= maxSize) {
                    // create an input split and add it to the splits array
                    addCreatedSplit(splits, nodes, validBlocks);
                    curSplitSize = 0;
                    validBlocks.clear();
                }
            }
        }
        // if there were any blocks left over and their combined size is
        // larger than minSplitNode, then combine them into one split.
        // Otherwise add them back to the unprocessed pool. It is likely 
        // that they will be combined with other blocks from the 
        // same rack later on.
        if (minSizeNode != 0 && curSplitSize >= minSizeNode) {
            // create an input split and add it to the splits array
            addCreatedSplit(splits, nodes, validBlocks);
        } else {
            for (OneBlockInfo oneblock : validBlocks) {
                blockToNodes.put(oneblock, oneblock.hosts);
            }
        }
        validBlocks.clear();
        nodes.clear();
        curSplitSize = 0;
    }

    // if blocks in a rack are below the specified minimum size, then keep them
    // in 'overflow'. After the processing of all racks is complete, these 
    // overflow blocks will be combined into splits.
    ArrayList<OneBlockInfo> overflowBlocks = new ArrayList<OneBlockInfo>();
    Set<String> racks = new HashSet<String>();

    // Process all racks over and over again until there is no more work to do.
    while (blockToNodes.size() > 0) {

        // Create one split for this rack before moving over to the next rack. 
        // Come back to this rack after creating a single split for each of the 
        // remaining racks.
        // Process one rack location at a time, combining all possible blocks
        // that reside on this rack into one split (constrained by the minimum
        // and maximum split sizes).

        // iterate over all racks 
        for (Iterator<Map.Entry<String, List<OneBlockInfo>>> iter = rackToBlocks.entrySet().iterator(); iter
                .hasNext();) {

            Map.Entry<String, List<OneBlockInfo>> one = iter.next();
            racks.add(one.getKey());
            List<OneBlockInfo> blocks = one.getValue();

            // for each block, copy it into validBlocks. Delete it from 
            // blockToNodes so that the same block does not appear in 
            // two different splits.
            boolean createdSplit = false;
            for (OneBlockInfo oneblock : blocks) {
                if (blockToNodes.containsKey(oneblock)) {
                    validBlocks.add(oneblock);
                    blockToNodes.remove(oneblock);
                    curSplitSize += oneblock.length;

                    // if the accumulated split size exceeds the maximum, then 
                    // create this split.
                    if (maxSize != 0 && curSplitSize >= maxSize) {
                        // create an input split and add it to the splits array
                        addCreatedSplit(splits, getHosts(racks), validBlocks);
                        createdSplit = true;
                        break;
                    }
                }
            }

            // if we created a split, then just go to the next rack
            if (createdSplit) {
                curSplitSize = 0;
                validBlocks.clear();
                racks.clear();
                continue;
            }

            if (!validBlocks.isEmpty()) {
                if (minSizeRack != 0 && curSplitSize >= minSizeRack) {
                    // if there is a minimum size specified, then create a single split
                    // otherwise, store these blocks into overflow data structure
                    addCreatedSplit(splits, getHosts(racks), validBlocks);
                } else {
                    // There were a few blocks in this rack that 
                    // remained to be processed. Keep them in 'overflow' block list. 
                    // These will be combined later.
                    overflowBlocks.addAll(validBlocks);
                }
            }
            curSplitSize = 0;
            validBlocks.clear();
            racks.clear();
        }
    }

    assert blockToNodes.isEmpty();
    assert curSplitSize == 0;
    assert validBlocks.isEmpty();
    assert racks.isEmpty();

    // Process all overflow blocks
    for (OneBlockInfo oneblock : overflowBlocks) {
        validBlocks.add(oneblock);
        curSplitSize += oneblock.length;

        // This might cause an existing rack location to be re-added,
        // but it should be ok.
        for (int i = 0; i < oneblock.racks.length; i++) {
            racks.add(oneblock.racks[i]);
        }

        // if the accumulated split size exceeds the maximum, then 
        // create this split.
        if (maxSize != 0 && curSplitSize >= maxSize) {
            // create an input split and add it to the splits array
            addCreatedSplit(splits, getHosts(racks), validBlocks);
            curSplitSize = 0;
            validBlocks.clear();
            racks.clear();
        }
    }

    // Process any remaining blocks.
    if (!validBlocks.isEmpty()) {
        addCreatedSplit(splits, getHosts(racks), validBlocks);
    }
}

From source file:com.twitter.algebra.matrix.format.MatrixOutputFormat.java

License:Apache License

@Override
public void checkOutputSpecs(JobContext context) throws IOException, InterruptedException {
    if (baseOut == null) {
        getBaseOutputFormat(context.getConfiguration());
    }
    super.checkOutputSpecs(context);
}

From source file:com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java

License:Apache License

/**
 * Go through each original input split, get its file path, and check the
 * index file:
 * a) keep it when there is no index prebuilt on this file
 *    (or the index file doesn't match the base file's checksum);
 * b) remove it when no matching value is found in the existing index file;
 * c) construct new, smaller input splits using the indexed blocks found
 *    in the index file.
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {

    String inputformat = job.getConfiguration().get(REALINPUTFORMAT);
    String valueClass = job.getConfiguration().get(VALUECLASS);

    List<InputSplit> filteredList = new ArrayList<InputSplit>();

    FileInputFormat<K, V> realInputFormat = getInputFormatClass(inputformat, valueClass);

    List<InputSplit> splits = realInputFormat.getSplits(job);

    // for indexing jobs, don't skip any input splits;
    // for a search job with no search filter, skip the index as well.
    if (isIndexingJob(job) || getFilterCondition(job) == null)
        return splits;

    Path prevFile = null; // remember the last input file we saw
    boolean foundIndexedFile = false; // is there an index file for prevFile?
    boolean firstTime = true; // is this the first time we see this file?

    long totalOriginalBytes = 0; //the bytes to be scanned without indexes.
    totalBytesNewSplits = 0;
    long startTime = System.currentTimeMillis();
    LOG.info("start filtering out original input splits (total " + splits.size() + ") using indexes");
    Configuration conf = job.getConfiguration();
    long splitMaxSize;

    // for each original input split check if we can filter it out.
    for (InputSplit split : splits) {
        FileSplit fileSplit = (FileSplit) split;
        Path path = fileSplit.getPath();
        splitLength = fileSplit.getLength();
        totalOriginalBytes += fileSplit.getLength();
        splitMaxSize = Math.max(splitLength,
                conf.getInt(INDEXED_SPLIT_SIZE, conf.getInt("dfs.block.size", 256 * 1024 * 1024)));

        /*
         * for each new file we see, we first check if it has been indexed or not;
         * if not, we just add the original input split; if yes, we use the index
         * file to add filtered splits for the file
         */
        if (prevFile != null && path.equals(prevFile)) {
            firstTime = false;
        } else {
            prevFile = path;
            firstTime = true;
            foundIndexedFile = foundIndexFile(job, path);
        }

        // if no index file, we'll have to read all original input
        // splits
        if (!foundIndexedFile)
            filteredList.add(fileSplit);
        else {
            // for each file we only add once its filtered input splits using index
            // file
            if (firstTime) {
                // LOG.info("first time saw " + path
                // + ", adding filtered splits from index file");
                filteredList.addAll(getFilteredSplits(job, path, fileSplit.getLocations(), splitMaxSize));
            }
        }
    }

    long endTime = System.currentTimeMillis();
    LOG.info("finished filtering out input splits, now total splits:" + filteredList.size() + ", seconds used: "
            + (endTime - startTime) / 1000);
    LOG.info(String.format("total bytes to read before filtering: %s," + " after filtering %s, bytes ratio: %s",
            totalOriginalBytes, totalBytesNewSplits, totalOriginalBytes / Math.max(1, totalBytesNewSplits)));
    return filteredList;
}

From source file:com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java

License:Apache License

/**
 * @param context the job context
 * @return the index directory provided to the job
 */
public static String getIndexDir(JobContext context) {
    return context.getConfiguration().get(INDEXDIR);
}