List of usage examples for org.apache.hadoop.mapreduce.JobContext.getConfiguration()
public Configuration getConfiguration();
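JobContext.getConfiguration() returns the job's Configuration, which is the usual channel for passing settings from the driver into input formats, mappers, and reducers. Before the examples below, here is a minimal sketch of that pattern; the property name "my.example.threshold" and the class are hypothetical, not from any of the listed sources.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;

public class ThresholdExample {
    // Driver side: store the setting on the job's Configuration before submission.
    public static void configure(Job job, long threshold) {
        job.getConfiguration().setLong("my.example.threshold", threshold);
    }

    // Task / InputFormat side: read it back through JobContext, with a default if unset.
    public static long readThreshold(JobContext context) {
        Configuration conf = context.getConfiguration();
        return conf.getLong("my.example.threshold", 0L);
    }
}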
From source file:com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java
License:Apache License
/**
 * @param context
 * @param file an input file provided to the job to work on
 * @return true if there is an index file for the input file
 * @throws IOException
 */
public static boolean foundIndexFile(JobContext context, Path file) throws IOException {
    Configuration conf = context.getConfiguration();
    FileSystem fs = file.getFileSystem(conf);
    Path indexFilePath = new Path(getIndexDir(context) + file.toUri().getRawPath() + "/"
        + BlockIndexedFileInputFormat.INDEXMETAFILENAME);
    if (!fs.exists(indexFilePath)) {
        LOG.info("no index file found for input file:" + file + " at location " + indexFilePath);
        return false;
    }
    FSDataInputStream in = fs.open(indexFilePath);
    ThriftWritable<FileIndexDescriptor> writable =
        ThriftWritable.newInstance(FileIndexDescriptor.class);
    writable.readFields(in);
    FileIndexDescriptor indexDescriptor = writable.get();
    in.close();
    return verifyInputFileCheckSum(indexDescriptor, context);
}
From source file:com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java
License:Apache License
/**
 * @param indexDescriptor
 * @param context
 * @return true if the current version of the base file's checksum
 *         matches what was stored in the indexDescriptor.
 * @throws IOException
 */
protected static boolean verifyInputFileCheckSum(FileIndexDescriptor indexDescriptor,
    JobContext context) throws IOException {
    Configuration conf = context.getConfiguration();
    FileSystem fs = FileSystem.get(conf);
    Path file = new Path(indexDescriptor.getSourcePath());
    FileChecksum oldChecksum = indexDescriptor.getChecksum();

    // check the input file's checksum.
    org.apache.hadoop.fs.FileChecksum cksum = fs.getFileChecksum(file);
    if (cksum != null) {
        FileChecksum newCksum = new FileChecksum(cksum.getAlgorithmName(),
            ByteBuffer.wrap(cksum.getBytes()), cksum.getLength());
        return (newCksum.equals(oldChecksum));
    }
    return true;
}
From source file:com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java
License:Apache License
private List<LongPairWritable> getFilterQualifiedBlocks(JobContext context, Path file,
    BinaryExpression filterCondition, long splitMaxSize) throws IOException {

    Expression lhs = filterCondition.getLhs();
    Expression rhs = filterCondition.getRhs();

    if (filterCondition.getOpType() == OpType.OP_EQ) { // "leaf node"
        // handle cases like 'abcd' == column, column == 'abcd'
        if (rhs instanceof Column && lhs instanceof Const) {
            lhs = filterCondition.getRhs();
            rhs = filterCondition.getLhs();
        }
        String columnName = ((Column) lhs).getName();
        String value = ((String) ((Const) rhs).getValue());
        Text searchedValue = new Text(value);

        FileStatus[] dirlist = listIndexFiles(context, file, columnName);
        int part_num = dirlist.length;
        int part_seqnum = (new HashPartitioner<Text, Text>()).getPartition(searchedValue,
            searchedValue, part_num);
        String part_name = "/part-r-" + String.format("%05d", part_seqnum);

        FileSystem fs = file.getFileSystem(context.getConfiguration());
        MapFile.Reader mapFileIndexReader = new MapFile.Reader(fs,
            getIndexDir(context) + file.toUri().getRawPath() + "/" + columnName + part_name,
            context.getConfiguration());
        ListLongPair indexedBlocks = new ListLongPair();
        mapFileIndexReader.get(searchedValue, indexedBlocks);
        mapFileIndexReader.close();
        return indexedBlocks.get();
    }

    List<LongPairWritable> blocksLeft =
        getFilterQualifiedBlocks(context, file, (BinaryExpression) lhs, splitMaxSize);
    List<LongPairWritable> blocksRight =
        getFilterQualifiedBlocks(context, file, (BinaryExpression) rhs, splitMaxSize);

    if (filterCondition.getOpType() == OpType.OP_AND) {
        return andFilter(blocksLeft, blocksRight);
    } else if (filterCondition.getOpType() == OpType.OP_OR) {
        return orFilter(blocksLeft, blocksRight, splitMaxSize);
    } else {
        throw new IOException("not supported filter condition:" + filterCondition);
    }
}
From source file:com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java
License:Apache License
/**
 * @param context
 * @param file the input file provided to the job to work on
 * @param columnName
 * @return the list of index files if there is an index directory created for
 *         the input file
 * @throws IOException
 */
protected static FileStatus[] listIndexFiles(JobContext context, Path file, String columnName)
    throws IOException {
    Path indexFilePath = new Path(getIndexDir(context) + file.toUri().getRawPath() + "/" + columnName);
    FileSystem fs = file.getFileSystem(context.getConfiguration());
    FileStatus[] dirlist = fs.listStatus(indexFilePath, indexFileFilter);
    return dirlist;
}
From source file:com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java
License:Apache License
protected boolean noFilterCondition(JobContext context) {
    return context.getConfiguration().get(FILTERCONDITIONS) == null;
}
From source file:com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java
License:Apache License
protected BinaryExpression getFilterCondition(JobContext context) throws IOException {
    if (filter != null) {
        return filter;
    }
    String filterString = context.getConfiguration().get(FILTERCONDITIONS);
    if (filterString == null) {
        return null;
    }
    return com.twitter.elephanttwin.retrieval.Expression.getFilterCondition(filterString);
}
From source file:com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java
License:Apache License
protected boolean isIndexingJob(JobContext context) {
    return context.getConfiguration().getBoolean(INDEXINGJOBFLAG, true);
}
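The three helpers above only read values that the driver is expected to have placed on the job's Configuration: a serialized filter expression under FILTERCONDITIONS and a boolean under INDEXINGJOBFLAG. A hedged sketch of that write side follows; it assumes the two constants are accessible from the driver and that the filter has already been serialized to a String, neither of which is shown in the source.

// Hedged sketch of the driver-side counterpart to noFilterCondition(), getFilterCondition(),
// and isIndexingJob(). FILTERCONDITIONS and INDEXINGJOBFLAG are assumed to be accessible
// String constants on BlockIndexedFileInputFormat; serializedFilter is assumed to be the
// String form of a filter expression.
public static void configureIndexedRead(org.apache.hadoop.mapreduce.Job job,
        String serializedFilter) {
    job.getConfiguration().set(BlockIndexedFileInputFormat.FILTERCONDITIONS, serializedFilter);
    // Mark this as a read (non-indexing) job so isIndexingJob() returns false.
    job.getConfiguration().setBoolean(BlockIndexedFileInputFormat.INDEXINGJOBFLAG, false);
}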
From source file:com.twitter.elephanttwin.retrieval.OneSplitInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    Configuration conf = job.getConfiguration();
    FileSplit split = (FileSplit) super.getSplits(job).get(0);
    List<InputSplit> lists = new ArrayList<InputSplit>();
    lists.add(new FileSplit(split.getPath(), conf.getLong(START, 0),
        conf.getLong(END, 0) - conf.getLong(START, 0), split.getLocations()));
    return lists;
}
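This input format trims the parent class's first split down to a byte range read from the configuration. A hedged sketch of the caller side, assuming START and END are accessible String constants on OneSplitInputFormat holding the property names (the source only shows them being read):

// Hedged sketch: restrict the read to one byte range before submitting the job.
// START and END are assumed to be accessible constants on OneSplitInputFormat.
public static void setRange(org.apache.hadoop.mapreduce.Job job, long start, long end) {
    job.getConfiguration().setLong(OneSplitInputFormat.START, start);
    job.getConfiguration().setLong(OneSplitInputFormat.END, end);
}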
From source file:com.twitter.hraven.mapreduce.CombineFileInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSizeNode = 0;
    long minSizeRack = 0;
    long maxSize = 0;
    Configuration conf = job.getConfiguration();

    // the values specified by setxxxSplitSize() take precedence over the
    // values that might have been specified in the config
    if (minSplitSizeNode != 0) {
        minSizeNode = minSplitSizeNode;
    } else {
        minSizeNode = conf.getLong("mapred.min.split.size.per.node", 0);
    }
    if (minSplitSizeRack != 0) {
        minSizeRack = minSplitSizeRack;
    } else {
        minSizeRack = conf.getLong("mapred.min.split.size.per.rack", 0);
    }
    if (maxSplitSize != 0) {
        maxSize = maxSplitSize;
    } else {
        maxSize = conf.getLong("mapred.max.split.size", 0);
    }
    if (minSizeNode != 0 && maxSize != 0 && minSizeNode > maxSize) {
        throw new IOException("Minimum split size per node " + minSizeNode
            + " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && maxSize != 0 && minSizeRack > maxSize) {
        throw new IOException("Minimum split size per rack " + minSizeRack
            + " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && minSizeNode > minSizeRack) {
        throw new IOException("Minimum split size per node " + minSizeNode
            + " cannot be smaller than minimum split size per rack " + minSizeRack);
    }

    // all the files in the input set
    Path[] paths = FileUtil.stat2Paths(listStatus(job).toArray(new FileStatus[0]));
    List<InputSplit> splits = new ArrayList<InputSplit>();
    if (paths.length == 0) {
        return splits;
    }

    // Convert them to Paths first. This is a costly operation and
    // we should do it first, otherwise we will incur doing it multiple
    // times, one time each for each pool in the next loop.
    List<Path> newpaths = new LinkedList<Path>();
    for (int i = 0; i < paths.length; i++) {
        Path p = new Path(paths[i].toUri().getPath());
        newpaths.add(p);
    }
    paths = null;
    System.out.println("Getting splits for: " + newpaths.size() + " paths.");

    // In one single iteration, process all the paths in a single pool.
    // Processing one pool at a time ensures that a split contains paths
    // from a single pool only.
    for (MultiPathFilter onepool : pools) {
        ArrayList<Path> myPaths = new ArrayList<Path>();
        System.out.println("Getting splits for a pool");

        // pick one input path. If it matches all the filters in a pool,
        // add it to the output set
        for (Iterator<Path> iter = newpaths.iterator(); iter.hasNext();) {
            Path p = iter.next();
            if (onepool.accept(p)) {
                myPaths.add(p); // add it to my output set
                iter.remove();
            }
        }
        System.out.println("Getting splits. myPaths size: " + myPaths.size());

        // create splits for all files in this pool.
        getMoreSplits(conf, myPaths.toArray(new Path[myPaths.size()]), maxSize, minSizeNode,
            minSizeRack, splits);
    }

    // create splits for all files that are not in any pool.
    getMoreSplits(conf, newpaths.toArray(new Path[newpaths.size()]), maxSize, minSizeNode,
        minSizeRack, splits);

    // free up rackToNodes map
    rackToNodes.clear();
    return splits;
}
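When the setMinSplitSizeNode/setMinSplitSizeRack/setMaxSplitSize setters are not used, the limits fall back to the legacy properties named in the code above. A minimal sketch of setting them on the job's Configuration (the byte values are illustrative only, not recommendations):

// Minimal sketch of configuring the fallback split-size properties read in getSplits().
Configuration conf = job.getConfiguration();
conf.setLong("mapred.min.split.size.per.node", 32 * 1024 * 1024L);  // 32 MB minimum per node
conf.setLong("mapred.min.split.size.per.rack", 64 * 1024 * 1024L);  // 64 MB minimum per rack
conf.setLong("mapred.max.split.size", 256 * 1024 * 1024L);          // 256 MB maximum per split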
From source file:com.uber.hoodie.hadoop.HoodieHiveUtil.java
License:Apache License
public static Integer readMaxCommits(JobContext job, String tableName) {
    String maxCommitName = String.format(HOODIE_MAX_COMMIT_PATTERN, tableName);
    int maxCommits = job.getConfiguration().getInt(maxCommitName, DEFAULT_MAX_COMMITS);
    if (maxCommits == MAX_COMMIT_ALL) {
        maxCommits = Integer.MAX_VALUE;
    }
    LOG.info("Read max commits - " + maxCommits);
    return maxCommits;
}
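The property name here is built per table from HOODIE_MAX_COMMIT_PATTERN, so a writer has to format the same pattern before setting the value. A hedged sketch of that write side, assuming HOODIE_MAX_COMMIT_PATTERN and MAX_COMMIT_ALL are accessible constants on HoodieHiveUtil (only the read side is shown above):

// Hedged sketch of the counterpart to readMaxCommits(): set the per-table limit on the job.
public static void setMaxCommits(org.apache.hadoop.mapreduce.Job job, String tableName,
        int maxCommits) {
    String maxCommitName = String.format(HoodieHiveUtil.HOODIE_MAX_COMMIT_PATTERN, tableName);
    // Pass HoodieHiveUtil.MAX_COMMIT_ALL to request all commits (mapped to Integer.MAX_VALUE on read).
    job.getConfiguration().setInt(maxCommitName, maxCommits);
}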