List of usage examples for org.apache.hadoop.mapreduce.JobContext.getConfiguration()
public Configuration getConfiguration();
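JobContext.getConfiguration() returns the job's Configuration, which is the usual channel for passing settings from the driver into input formats, mappers, and reducers. Before the examples below, here is a minimal sketch of that pattern; the property name "my.example.threshold" and the class are hypothetical, not from any of the listed sources.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;

public class ThresholdExample {
    // Driver side: store the setting on the job's Configuration before submission.
    public static void configure(Job job, long threshold) {
        job.getConfiguration().setLong("my.example.threshold", threshold);
    }

    // Task / InputFormat side: read it back through JobContext, with a default if unset.
    public static long readThreshold(JobContext context) {
        Configuration conf = context.getConfiguration();
        return conf.getLong("my.example.threshold", 0L);
    }
}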
From source file:com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java
License:Apache License
/**
 * @param context
 * @param file an input file provided to the job to work on
 * @return true if there is an index file for the input file
 * @throws IOException
 */
public static boolean foundIndexFile(JobContext context, Path file) throws IOException {
    Configuration conf = context.getConfiguration();
    FileSystem fs = file.getFileSystem(conf);
    Path indexFilePath = new Path(getIndexDir(context) + file.toUri().getRawPath() + "/"
        + BlockIndexedFileInputFormat.INDEXMETAFILENAME);
    if (!fs.exists(indexFilePath)) {
        LOG.info("no index file found for input file:" + file + " at location " + indexFilePath);
        return false;
    }
    FSDataInputStream in = fs.open(indexFilePath);
    ThriftWritable<FileIndexDescriptor> writable =
        ThriftWritable.newInstance(FileIndexDescriptor.class);
    writable.readFields(in);
    FileIndexDescriptor indexDescriptor = writable.get();
    in.close();
    return verifyInputFileCheckSum(indexDescriptor, context);
}
From source file:com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java
License:Apache License
/**
 * @param indexDescriptor
 * @param context
 * @return true if the current version of the base file's checksum
 *         matches what was stored in the indexDescriptor.
 * @throws IOException
 */
protected static boolean verifyInputFileCheckSum(FileIndexDescriptor indexDescriptor,
    JobContext context) throws IOException {
    Configuration conf = context.getConfiguration();
    FileSystem fs = FileSystem.get(conf);
    Path file = new Path(indexDescriptor.getSourcePath());
    FileChecksum oldChecksum = indexDescriptor.getChecksum();

    // check the input file's checksum.
    org.apache.hadoop.fs.FileChecksum cksum = fs.getFileChecksum(file);
    if (cksum != null) {
        FileChecksum newCksum = new FileChecksum(cksum.getAlgorithmName(),
            ByteBuffer.wrap(cksum.getBytes()), cksum.getLength());
        return (newCksum.equals(oldChecksum));
    }
    return true;
}
From source file:com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java
License:Apache License
private List<LongPairWritable> getFilterQualifiedBlocks(JobContext context, Path file,
    BinaryExpression filterCondition, long splitMaxSize) throws IOException {

    Expression lhs = filterCondition.getLhs();
    Expression rhs = filterCondition.getRhs();

    if (filterCondition.getOpType() == OpType.OP_EQ) { // "leaf node"
        // handle cases like 'abcd' == column, column == 'abcd'
        if (rhs instanceof Column && lhs instanceof Const) {
            lhs = filterCondition.getRhs();
            rhs = filterCondition.getLhs();
        }
        String columnName = ((Column) lhs).getName();
        String value = ((String) ((Const) rhs).getValue());
        Text searchedValue = new Text(value);

        FileStatus[] dirlist = listIndexFiles(context, file, columnName);
        int part_num = dirlist.length;
        int part_seqnum = (new HashPartitioner<Text, Text>()).getPartition(searchedValue,
            searchedValue, part_num);
        String part_name = "/part-r-" + String.format("%05d", part_seqnum);

        FileSystem fs = file.getFileSystem(context.getConfiguration());
        MapFile.Reader mapFileIndexReader = new MapFile.Reader(fs,
            getIndexDir(context) + file.toUri().getRawPath() + "/" + columnName + part_name,
            context.getConfiguration());
        ListLongPair indexedBlocks = new ListLongPair();
        mapFileIndexReader.get(searchedValue, indexedBlocks);
        mapFileIndexReader.close();
        return indexedBlocks.get();
    }

    List<LongPairWritable> blocksLeft =
        getFilterQualifiedBlocks(context, file, (BinaryExpression) lhs, splitMaxSize);
    List<LongPairWritable> blocksRight =
        getFilterQualifiedBlocks(context, file, (BinaryExpression) rhs, splitMaxSize);

    if (filterCondition.getOpType() == OpType.OP_AND) {
        return andFilter(blocksLeft, blocksRight);
    } else if (filterCondition.getOpType() == OpType.OP_OR) {
        return orFilter(blocksLeft, blocksRight, splitMaxSize);
    } else {
        throw new IOException("not supported filter condition:" + filterCondition);
    }
}
From source file:com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java
License:Apache License
/**
 * @param context
 * @param file the input file provided to the job to work on
 * @param columnName
 * @return the list of index files if there is an index directory created for
 *         the input file
 * @throws IOException
 */
protected static FileStatus[] listIndexFiles(JobContext context, Path file, String columnName)
    throws IOException {
    Path indexFilePath = new Path(getIndexDir(context) + file.toUri().getRawPath() + "/" + columnName);
    FileSystem fs = file.getFileSystem(context.getConfiguration());
    FileStatus[] dirlist = fs.listStatus(indexFilePath, indexFileFilter);
    return dirlist;
}
From source file:com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java
License:Apache License
protected boolean noFilterCondition(JobContext context) {
    return context.getConfiguration().get(FILTERCONDITIONS) == null;
}
From source file:com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java
License:Apache License
protected BinaryExpression getFilterCondition(JobContext context) throws IOException {
    if (filter != null) {
        return filter;
    }
    String filterString = context.getConfiguration().get(FILTERCONDITIONS);
    if (filterString == null) {
        return null;
    }
    return com.twitter.elephanttwin.retrieval.Expression.getFilterCondition(filterString);
}
From source file:com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java
License:Apache License
protected boolean isIndexingJob(JobContext context) {
    return context.getConfiguration().getBoolean(INDEXINGJOBFLAG, true);
}
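The three helpers above only read values that the driver is expected to have placed on the job's Configuration: a serialized filter expression under FILTERCONDITIONS and a boolean under INDEXINGJOBFLAG. A hedged sketch of that write side follows; it assumes the two constants are accessible from the driver and that the filter has already been serialized to a String, neither of which is shown in the source.

// Hedged sketch of the driver-side counterpart to noFilterCondition(), getFilterCondition(),
// and isIndexingJob(). FILTERCONDITIONS and INDEXINGJOBFLAG are assumed to be accessible
// String constants on BlockIndexedFileInputFormat; serializedFilter is assumed to be the
// String form of a filter expression.
public static void configureIndexedRead(org.apache.hadoop.mapreduce.Job job,
        String serializedFilter) {
    job.getConfiguration().set(BlockIndexedFileInputFormat.FILTERCONDITIONS, serializedFilter);
    // Mark this as a read (non-indexing) job so isIndexingJob() returns false.
    job.getConfiguration().setBoolean(BlockIndexedFileInputFormat.INDEXINGJOBFLAG, false);
}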
From source file:com.twitter.elephanttwin.retrieval.OneSplitInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    Configuration conf = job.getConfiguration();
    FileSplit split = (FileSplit) super.getSplits(job).get(0);
    List<InputSplit> lists = new ArrayList<InputSplit>();
    lists.add(new FileSplit(split.getPath(), conf.getLong(START, 0),
        conf.getLong(END, 0) - conf.getLong(START, 0), split.getLocations()));
    return lists;
}
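This input format trims the parent class's first split down to a byte range read from the configuration. A hedged sketch of the caller side, assuming START and END are accessible String constants on OneSplitInputFormat holding the property names (the source only shows them being read):

// Hedged sketch: restrict the read to one byte range before submitting the job.
// START and END are assumed to be accessible constants on OneSplitInputFormat.
public static void setRange(org.apache.hadoop.mapreduce.Job job, long start, long end) {
    job.getConfiguration().setLong(OneSplitInputFormat.START, start);
    job.getConfiguration().setLong(OneSplitInputFormat.END, end);
}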
From source file:com.twitter.hraven.mapreduce.CombineFileInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSizeNode = 0;
    long minSizeRack = 0;
    long maxSize = 0;
    Configuration conf = job.getConfiguration();

    // the values specified by setxxxSplitSize() take precedence over the
    // values that might have been specified in the config
    if (minSplitSizeNode != 0) {
        minSizeNode = minSplitSizeNode;
    } else {
        minSizeNode = conf.getLong("mapred.min.split.size.per.node", 0);
    }
    if (minSplitSizeRack != 0) {
        minSizeRack = minSplitSizeRack;
    } else {
        minSizeRack = conf.getLong("mapred.min.split.size.per.rack", 0);
    }
    if (maxSplitSize != 0) {
        maxSize = maxSplitSize;
    } else {
        maxSize = conf.getLong("mapred.max.split.size", 0);
    }
    if (minSizeNode != 0 && maxSize != 0 && minSizeNode > maxSize) {
        throw new IOException("Minimum split size per node " + minSizeNode
            + " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && maxSize != 0 && minSizeRack > maxSize) {
        throw new IOException("Minimum split size per rack " + minSizeRack
            + " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && minSizeNode > minSizeRack) {
        throw new IOException("Minimum split size per node " + minSizeNode
            + " cannot be smaller than minimum split size per rack " + minSizeRack);
    }

    // all the files in the input set
    Path[] paths = FileUtil.stat2Paths(listStatus(job).toArray(new FileStatus[0]));
    List<InputSplit> splits = new ArrayList<InputSplit>();
    if (paths.length == 0) {
        return splits;
    }

    // Convert them to Paths first. This is a costly operation and
    // we should do it first, otherwise we will incur doing it multiple
    // times, one time each for each pool in the next loop.
    List<Path> newpaths = new LinkedList<Path>();
    for (int i = 0; i < paths.length; i++) {
        Path p = new Path(paths[i].toUri().getPath());
        newpaths.add(p);
    }
    paths = null;
    System.out.println("Getting splits for: " + newpaths.size() + " paths.");

    // In one single iteration, process all the paths in a single pool.
    // Processing one pool at a time ensures that a split contains paths
    // from a single pool only.
    for (MultiPathFilter onepool : pools) {
        ArrayList<Path> myPaths = new ArrayList<Path>();
        System.out.println("Getting splits for a pool");

        // pick one input path. If it matches all the filters in a pool,
        // add it to the output set
        for (Iterator<Path> iter = newpaths.iterator(); iter.hasNext();) {
            Path p = iter.next();
            if (onepool.accept(p)) {
                myPaths.add(p); // add it to my output set
                iter.remove();
            }
        }
        System.out.println("Getting splits. myPaths size: " + myPaths.size());

        // create splits for all files in this pool.
        getMoreSplits(conf, myPaths.toArray(new Path[myPaths.size()]), maxSize, minSizeNode,
            minSizeRack, splits);
    }

    // create splits for all files that are not in any pool.
    getMoreSplits(conf, newpaths.toArray(new Path[newpaths.size()]), maxSize, minSizeNode,
        minSizeRack, splits);

    // free up rackToNodes map
    rackToNodes.clear();
    return splits;
}
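When the setMinSplitSizeNode/setMinSplitSizeRack/setMaxSplitSize setters are not used, the limits fall back to the legacy properties named in the code above. A minimal sketch of setting them on the job's Configuration (the byte values are illustrative only, not recommendations):

// Minimal sketch of configuring the fallback split-size properties read in getSplits().
Configuration conf = job.getConfiguration();
conf.setLong("mapred.min.split.size.per.node", 32 * 1024 * 1024L);  // 32 MB minimum per node
conf.setLong("mapred.min.split.size.per.rack", 64 * 1024 * 1024L);  // 64 MB minimum per rack
conf.setLong("mapred.max.split.size", 256 * 1024 * 1024L);          // 256 MB maximum per split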
From source file:com.uber.hoodie.hadoop.HoodieHiveUtil.java
License:Apache License
public static Integer readMaxCommits(JobContext job, String tableName) {
    String maxCommitName = String.format(HOODIE_MAX_COMMIT_PATTERN, tableName);
    int maxCommits = job.getConfiguration().getInt(maxCommitName, DEFAULT_MAX_COMMITS);
    if (maxCommits == MAX_COMMIT_ALL) {
        maxCommits = Integer.MAX_VALUE;
    }
    LOG.info("Read max commits - " + maxCommits);
    return maxCommits;
}
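The property name here is built per table from HOODIE_MAX_COMMIT_PATTERN, so a writer has to format the same pattern before setting the value. A hedged sketch of that write side, assuming HOODIE_MAX_COMMIT_PATTERN and MAX_COMMIT_ALL are accessible constants on HoodieHiveUtil (only the read side is shown above):

// Hedged sketch of the counterpart to readMaxCommits(): set the per-table limit on the job.
public static void setMaxCommits(org.apache.hadoop.mapreduce.Job job, String tableName,
        int maxCommits) {
    String maxCommitName = String.format(HoodieHiveUtil.HOODIE_MAX_COMMIT_PATTERN, tableName);
    // Pass HoodieHiveUtil.MAX_COMMIT_ALL to request all commits (mapped to Integer.MAX_VALUE on read).
    job.getConfiguration().setInt(maxCommitName, maxCommits);
}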