Example usage for org.apache.hadoop.mapreduce JobContext getConfiguration

List of usage examples for org.apache.hadoop.mapreduce JobContext getConfiguration

Introduction

On this page you can find example usage of org.apache.hadoop.mapreduce JobContext getConfiguration.

Prototype

public Configuration getConfiguration();

Document

Return the configuration for the job.
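
A typical caller is an InputFormat (or OutputFormat, partitioner, etc.) that needs job-level settings while computing splits. The following is a minimal, hypothetical sketch; the property key my.app.min.record.length and the class name are illustrative and are not part of any library shown on this page.

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class ConfiguredTextInputFormat extends TextInputFormat {

    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException {
        // Pull a job-level setting out of the job's Configuration;
        // the key and default value are illustrative.
        Configuration conf = context.getConfiguration();
        long minRecordLength = conf.getLong("my.app.min.record.length", 0L);
        // Example use: log the effective value for this job.
        System.out.println("min record length = " + minRecordLength);
        return super.getSplits(context);
    }
}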

Usage

From source file:com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java

License:Apache License

/**
 * @param context
 * @param file
 *          an input file to work on provided to the job
 * @return true if there is a index file for the input file
 * @throws IOException
 */
public static boolean foundIndexFile(JobContext context, Path file) throws IOException {

    Configuration conf = context.getConfiguration();
    FileSystem fs = file.getFileSystem(conf);
    Path indexFilePath = new Path(getIndexDir(context) + file.toUri().getRawPath() + "/"
            + BlockIndexedFileInputFormat.INDEXMETAFILENAME);
    if (!fs.exists(indexFilePath)) {
        LOG.info("no index file found for input file:" + file + " at location " + indexFilePath);
        return false;
    }
    FSDataInputStream in = fs.open(indexFilePath);

    ThriftWritable<FileIndexDescriptor> writable = ThriftWritable.newInstance(FileIndexDescriptor.class);
    writable.readFields(in);
    FileIndexDescriptor indexDescriptor = writable.get();
    in.close();
    return verifyInputFileCheckSum(indexDescriptor, context);
}

From source file:com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java

License:Apache License

/**
 * @param indexDescriptor
 * @param context
 * @return true if the current version of the base file's checksum
 * matches what was stored in the indexDescriptor.
 * @throws IOException
 */
protected static boolean verifyInputFileCheckSum(FileIndexDescriptor indexDescriptor, JobContext context)
        throws IOException {

    Configuration conf = context.getConfiguration();
    FileSystem fs = FileSystem.get(conf);

    Path file = new Path(indexDescriptor.getSourcePath());
    FileChecksum oldChecksum = indexDescriptor.getChecksum();

    // check InputFile Checksum.
    org.apache.hadoop.fs.FileChecksum cksum = fs.getFileChecksum(file);
    if (cksum != null) {
        FileChecksum newCksum = new FileChecksum(cksum.getAlgorithmName(), ByteBuffer.wrap(cksum.getBytes()),
                cksum.getLength());
        return (newCksum.equals(oldChecksum));
    }
    return true;
}

From source file:com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java

License:Apache License

private List<LongPairWritable> getFilterQualifiedBlocks(JobContext context, Path file,
        BinaryExpression filterCondition, long splitMaxSize) throws IOException {

    Expression lhs = filterCondition.getLhs();
    Expression rhs = filterCondition.getRhs();

    if (filterCondition.getOpType() == OpType.OP_EQ) { // "leaf node"
        // handle cases like 'abcd' == column , column == 'abcd'
        if (rhs instanceof Column && lhs instanceof Const) {
            lhs = filterCondition.getRhs();
            rhs = filterCondition.getLhs();
        }
        String columnName = ((Column) lhs).getName();
        String value = ((String) ((Const) rhs).getValue());
        Text searchedValue = new Text(value);

        FileStatus[] dirlist = listIndexFiles(context, file, columnName);
        int part_num = dirlist.length;
        int part_seqnum = (new HashPartitioner<Text, Text>()).getPartition(searchedValue, searchedValue,
                part_num);
        String part_name = "/part-r-" + String.format("%05d", part_seqnum);
        FileSystem fs = file.getFileSystem(context.getConfiguration());
        MapFile.Reader mapFileIndexReader = new MapFile.Reader(fs,
                getIndexDir(context) + file.toUri().getRawPath() + "/" + columnName + part_name,
                context.getConfiguration());
        ListLongPair indexedBlocks = new ListLongPair();
        mapFileIndexReader.get(searchedValue, indexedBlocks);
        mapFileIndexReader.close();
        return indexedBlocks.get();
    }

    List<LongPairWritable> blocksLeft = getFilterQualifiedBlocks(context, file, (BinaryExpression) lhs,
            splitMaxSize);
    List<LongPairWritable> blocksRight = getFilterQualifiedBlocks(context, file, (BinaryExpression) rhs,
            splitMaxSize);

    if (filterCondition.getOpType() == OpType.OP_AND)
        return andFilter(blocksLeft, blocksRight);
    else if (filterCondition.getOpType() == OpType.OP_OR) {
        return orFilter(blocksLeft, blocksRight, splitMaxSize);
    } else
        throw new IOException("not supported filter condition:" + filterCondition);
}

From source file:com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java

License:Apache License

/**
 * @param context
 * @param file
 *          the input file provided to the job to work on
 * @param columnName
 * @return the list of index files if there is an index directory created for
 *         the input file
 * @throws IOException
 */
protected static FileStatus[] listIndexFiles(JobContext context, Path file, String columnName)
        throws IOException {

    Path indexFilePath = new Path(getIndexDir(context) + file.toUri().getRawPath() + "/" + columnName);

    FileSystem fs = file.getFileSystem(context.getConfiguration());
    FileStatus[] dirlist = fs.listStatus(indexFilePath, indexFileFilter);
    return dirlist;
}

From source file:com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java

License:Apache License

protected boolean noFilterCondition(JobContext context) {
    return context.getConfiguration().get(FILTERCONDITIONS) == null;
}

From source file:com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java

License:Apache License

protected BinaryExpression getFilterCondition(JobContext context) throws IOException {
    if (filter != null) {
        return filter;
    }
    String filterString = context.getConfiguration().get(FILTERCONDITIONS);
    if (filterString == null) {
        return null;
    }
    return com.twitter.elephanttwin.retrieval.Expression.getFilterCondition(filterString);
}

From source file:com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java

License:Apache License

protected boolean isIndexingJob(JobContext context) {
    return context.getConfiguration().getBoolean(INDEXINGJOBFLAG, true);
}
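
For completeness, the values read back above originate on the submitting side, where the driver writes them into the same Configuration before launching the job. The sketch below is an assumption about how such a driver might look; the literal keys "filter.conditions" and "indexing.job.flag" merely stand in for the FILTERCONDITIONS and INDEXINGJOBFLAG constants, whose actual values are not shown on this page.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class IndexedScanDriverSketch {
    public static void main(String[] args) throws IOException {
        // Driver-side sketch: set the values that the InputFormat later
        // reads back through context.getConfiguration().
        Job job = Job.getInstance(new Configuration(), "indexed-scan");
        Configuration conf = job.getConfiguration();

        // "filter.conditions" and "indexing.job.flag" are illustrative
        // stand-ins for FILTERCONDITIONS and INDEXINGJOBFLAG.
        conf.set("filter.conditions", args.length > 0 ? args[0] : "");
        conf.setBoolean("indexing.job.flag", false);
        // ... configure input/output formats and submit the job ...
    }
}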

From source file:com.twitter.elephanttwin.retrieval.OneSplitInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    Configuration conf = job.getConfiguration();
    FileSplit split = (FileSplit) super.getSplits(job).get(0);

    List<InputSplit> lists = new ArrayList<InputSplit>();

    lists.add(new FileSplit(split.getPath(), conf.getLong(START, 0),
            conf.getLong(END, 0) - conf.getLong(START, 0), split.getLocations()));
    return lists;
}

From source file:com.twitter.hraven.mapreduce.CombineFileInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {

    long minSizeNode = 0;
    long minSizeRack = 0;
    long maxSize = 0;
    Configuration conf = job.getConfiguration();

    // the values specified by setxxxSplitSize() takes precedence over the
    // values that might have been specified in the config
    if (minSplitSizeNode != 0) {
        minSizeNode = minSplitSizeNode;
    } else {
        minSizeNode = conf.getLong("mapred.min.split.size.per.node", 0);
    }
    if (minSplitSizeRack != 0) {
        minSizeRack = minSplitSizeRack;
    } else {
        minSizeRack = conf.getLong("mapred.min.split.size.per.rack", 0);
    }
    if (maxSplitSize != 0) {
        maxSize = maxSplitSize;
    } else {
        maxSize = conf.getLong("mapred.max.split.size", 0);
    }
    if (minSizeNode != 0 && maxSize != 0 && minSizeNode > maxSize) {
        throw new IOException("Minimum split size pernode " + minSizeNode
                + " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && maxSize != 0 && minSizeRack > maxSize) {
        throw new IOException("Minimum split size per rack" + minSizeRack
                + " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && minSizeNode > minSizeRack) {
        throw new IOException("Minimum split size per node" + minSizeNode
                + " cannot be smaller than minimum split " + "size per rack " + minSizeRack);
    }

    // all the files in input set
    Path[] paths = FileUtil.stat2Paths(listStatus(job).toArray(new FileStatus[0]));
    List<InputSplit> splits = new ArrayList<InputSplit>();
    if (paths.length == 0) {
        return splits;
    }

    // Convert them to Paths first. This is a costly operation and 
    // we should do it first, otherwise we will incur doing it multiple
    // times, one time each for each pool in the next loop.
    List<Path> newpaths = new LinkedList<Path>();
    for (int i = 0; i < paths.length; i++) {
        Path p = new Path(paths[i].toUri().getPath());
        newpaths.add(p);
    }
    paths = null;

    System.out.println("Getting splits for: " + newpaths.size() + " paths.");

    // In one single iteration, process all the paths in a single pool.
    // Processing one pool at a time ensures that a split contains paths
    // from a single pool only.
    for (MultiPathFilter onepool : pools) {
        ArrayList<Path> myPaths = new ArrayList<Path>();

        System.out.println("Getting splits for a pool");

        // pick one input path. If it matches all the filters in a pool,
        // add it to the output set
        for (Iterator<Path> iter = newpaths.iterator(); iter.hasNext();) {
            Path p = iter.next();
            if (onepool.accept(p)) {
                myPaths.add(p); // add it to my output set
                iter.remove();
            }
        }
        System.out.println("Getting splits. myPaths size: " + myPaths.size());
        // create splits for all files in this pool.
        getMoreSplits(conf, myPaths.toArray(new Path[myPaths.size()]), maxSize, minSizeNode, minSizeRack,
                splits);
    }

    // create splits for all files that are not in any pool.
    getMoreSplits(conf, newpaths.toArray(new Path[newpaths.size()]), maxSize, minSizeNode, minSizeRack, splits);

    // free up rackToNodes map
    rackToNodes.clear();
    return splits;
}
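
Since the node, rack, and maximum split sizes above are read from the job configuration whenever no explicit setXxxSplitSize() value is given, a driver can tune them per job. Below is a minimal sketch using the same (older, mapred.*) property names that the snippet reads; the sizes are only examples.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class CombineSplitSizeTuning {
    public static void main(String[] args) throws IOException {
        Job job = Job.getInstance(new Configuration(), "combine-input");
        Configuration conf = job.getConfiguration();

        // Same keys that getSplits() above falls back to; values are examples.
        conf.setLong("mapred.min.split.size.per.node", 32L * 1024 * 1024);
        conf.setLong("mapred.min.split.size.per.rack", 64L * 1024 * 1024);
        conf.setLong("mapred.max.split.size", 256L * 1024 * 1024);
    }
}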

From source file:com.uber.hoodie.hadoop.HoodieHiveUtil.java

License:Apache License

public static Integer readMaxCommits(JobContext job, String tableName) {
    String maxCommitName = String.format(HOODIE_MAX_COMMIT_PATTERN, tableName);
    int maxCommits = job.getConfiguration().getInt(maxCommitName, DEFAULT_MAX_COMMITS);
    if (maxCommits == MAX_COMMIT_ALL) {
        maxCommits = Integer.MAX_VALUE;
    }
    LOG.info("Read max commits - " + maxCommits);
    return maxCommits;
}
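
A hypothetical caller simply passes its JobContext and the table name; the actual property key is built from HOODIE_MAX_COMMIT_PATTERN, whose literal value is not shown here, so it is not spelled out in this sketch.

import org.apache.hadoop.mapreduce.JobContext;

import com.uber.hoodie.hadoop.HoodieHiveUtil;

public class MaxCommitsLookup {
    // Hypothetical helper: resolve the commit limit for an incremental read
    // of the given table, falling back to DEFAULT_MAX_COMMITS when unset.
    static int resolveCommitLimit(JobContext context, String tableName) {
        return HoodieHiveUtil.readMaxCommits(context, tableName);
    }
}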