List of usage examples for org.apache.hadoop.mapreduce.JobContext.getConfiguration()
public Configuration getConfiguration();
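In task code, getConfiguration() is most often reached through a Mapper or Reducer Context, both of which extend JobContext. A minimal, self-contained sketch (the "example.threshold" key is hypothetical):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class ConfigReadingMapper extends Mapper<LongWritable, Text, Text, Text> {
    private int threshold;

    @Override
    protected void setup(Context context) {
        // Mapper.Context extends JobContext, so getConfiguration() is available here;
        // "example.threshold" is a hypothetical key set by the driver before submission
        Configuration conf = context.getConfiguration();
        threshold = conf.getInt("example.threshold", 10);
    }
}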
From source file:com.linkedin.pinot.hadoop.io.PinotOutputFormat.java
License:Apache License
public static int getHllSize(JobContext context) {
    return context.getConfiguration().getInt(PinotOutputFormat.HLL_SIZE, 9);
}
From source file:com.linkedin.pinot.hadoop.io.PinotOutputFormat.java
License:Apache License
public static String getHllColumns(JobContext context) {
    return context.getConfiguration().get(PinotOutputFormat.HLL_COLUMNS);
}
From source file:com.linkedin.pinot.hadoop.io.PinotOutputFormat.java
License:Apache License
public static String getHllSuffix(JobContext context) {
    return context.getConfiguration().get(PinotOutputFormat.HLL_SUFFIX,
            HllConstants.DEFAULT_HLL_DERIVE_COLUMN_SUFFIX);
}
From source file:com.linkedin.pinot.hadoop.io.PinotOutputFormat.java
License:Apache License
public static Class<?> getDataWriteSupportClass(JobContext context) {
    String className = context.getConfiguration().get(PinotOutputFormat.PINOT_RECORD_SERIALIZATION_CLASS);
    if (className == null) {
        throw new RuntimeException("pinot data write support class not set");
    }
    try {
        return context.getConfiguration().getClassByName(className);
    } catch (ClassNotFoundException e) {
        throw new RuntimeException(e);
    }
}
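The four getters above read values on the task side that the driver must have stored before job submission. A minimal sketch of that write side, assuming the PinotOutputFormat constants are plain String configuration keys and using a hypothetical serialization class name:

// hypothetical driver-side setup; `job` is an org.apache.hadoop.mapreduce.Job
Job job = Job.getInstance(new Configuration(), "pinot-segment-build");
job.getConfiguration().setInt(PinotOutputFormat.HLL_SIZE, 12);
job.getConfiguration().set(PinotOutputFormat.PINOT_RECORD_SERIALIZATION_CLASS,
        "com.example.MyRecordSerialization"); // hypothetical class name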
From source file:com.linkedin.whiteelephant.mapreduce.MyAvroMultipleOutputs.java
License:Apache License
@SuppressWarnings("unchecked")
private static Class<? extends OutputFormat<?, ?>> getNamedOutputFormatClass(JobContext job, String namedOutput) {
    return (Class<? extends OutputFormat<?, ?>>) job.getConfiguration()
            .getClass(MO_PREFIX + namedOutput + FORMAT, null, OutputFormat.class);
}
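Configuration.getClass(name, defaultValue, xface) resolves a class name stored under the key and verifies it against the given interface; here it returns null when no format was registered for the named output. A minimal sketch of the matching write side, assuming the same MO_PREFIX + namedOutput + FORMAT key:

// hypothetical registration step; the key must match what the getter reads back
job.getConfiguration().setClass(MO_PREFIX + namedOutput + FORMAT,
        TextOutputFormat.class, OutputFormat.class);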
From source file:com.marklogic.contentpump.CombineDocumentInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);
    // generate splits
    List<InputSplit> splits = super.getSplits(job);
    List<InputSplit> combinedSplits = new ArrayList<InputSplit>();
    CombineDocumentSplit split = null;
    for (InputSplit file : splits) {
        Path path = ((FileSplit) file).getPath();
        FileSystem fs = path.getFileSystem(job.getConfiguration());
        FileStatus status = fs.getFileStatus(path);
        long length = status.getLen();
        long blockSize = status.getBlockSize();
        long splitSize = computeSplitSize(blockSize, minSize, maxSize);
        if (length != 0) {
            if (split == null) {
                split = new CombineDocumentSplit();
            }
            try {
                if (split.getLength() + length < splitSize || split.getLength() < minSize) {
                    split.addSplit((FileSplit) file);
                } else {
                    combinedSplits.add(split);
                    split = new CombineDocumentSplit();
                    split.addSplit((FileSplit) file);
                }
            } catch (InterruptedException e) {
                LOG.error(e);
                throw new RuntimeException(e);
            }
        }
    }
    if (split != null) {
        combinedSplits.add(split);
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("Total # of splits: " + splits.size());
        LOG.debug("Total # of combined splits: " + combinedSplits.size());
    }
    return combinedSplits;
}
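The minSize/maxSize bounds consulted above come from the job configuration and can be tuned on the driver with the standard FileInputFormat helpers; a minimal sketch (the sizes are illustrative):

// writes mapreduce.input.fileinputformat.split.minsize / .maxsize into the job's configuration
FileInputFormat.setMinInputSplitSize(job, 64L * 1024 * 1024);  // 64 MB
FileInputFormat.setMaxInputSplitSize(job, 256L * 1024 * 1024); // 256 MB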
From source file:com.marklogic.contentpump.DelimitedTextInputFormat.java
License:Apache License
public List<InputSplit> getSplits(JobContext job) throws IOException {
    boolean delimSplit = isSplitInput(job.getConfiguration());
    // if delimSplit is true, the size of each split is determined by
    // Math.max(minSize, Math.min(maxSize, blockSize)) in FileInputFormat
    List<InputSplit> splits = super.getSplits(job);
    if (!delimSplit) {
        return splits;
    }
    if (splits.size() >= SPLIT_COUNT_LIMIT) {
        // if #splits exceeds SPLIT_COUNT_LIMIT (1 million), there is enough
        // parallelism already, so there is no point in splitting further
        LOG.warn("Exceeding SPLIT_COUNT_LIMIT, input_split is off: " + SPLIT_COUNT_LIMIT);
        DefaultStringifier.store(job.getConfiguration(), false, ConfigConstants.CONF_SPLIT_INPUT);
        return splits;
    }
    // add header info into splits
    List<InputSplit> populatedSplits = new ArrayList<InputSplit>();
    LOG.info(splits.size() + " DelimitedSplits generated");
    Configuration conf = job.getConfiguration();
    char delimiter = 0;
    ArrayList<Text> hlist = new ArrayList<Text>();
    for (InputSplit file : splits) {
        FileSplit fsplit = ((FileSplit) file);
        Path path = fsplit.getPath();
        FileSystem fs = path.getFileSystem(conf);
        if (fsplit.getStart() == 0) {
            // parse the split and extract the header
            FSDataInputStream fileIn = fs.open(path);
            String delimStr = conf.get(ConfigConstants.CONF_DELIMITER, ConfigConstants.DEFAULT_DELIMITER);
            if (delimStr.length() == 1) {
                delimiter = delimStr.charAt(0);
            } else {
                LOG.error("Incorrect delimiter: " + delimStr + ". Expects a single character.");
            }
            String encoding = conf.get(MarkLogicConstants.OUTPUT_CONTENT_ENCODING,
                    MarkLogicConstants.DEFAULT_OUTPUT_CONTENT_ENCODING);
            InputStreamReader instream = new InputStreamReader(fileIn, encoding);
            CSVParser parser = new CSVParser(instream,
                    CSVParserFormatter.getFormat(delimiter, DelimitedTextReader.encapsulator, true, true));
            Iterator<CSVRecord> it = parser.iterator();
            String[] header = null;
            if (it.hasNext()) {
                CSVRecord record = (CSVRecord) it.next();
                Iterator<String> recordIterator = record.iterator();
                int recordSize = record.size();
                header = new String[recordSize];
                for (int i = 0; i < recordSize; i++) {
                    if (recordIterator.hasNext()) {
                        header[i] = (String) recordIterator.next();
                    } else {
                        throw new IOException("Record size doesn't match the real size");
                    }
                }
                EncodingUtil.handleBOMUTF8(header, 0);
                hlist.clear();
                for (String s : header) {
                    hlist.add(new Text(s));
                }
            }
            instream.close();
        }
        DelimitedSplit ds = new DelimitedSplit(new TextArrayWritable(hlist.toArray(new Text[hlist.size()])),
                path, fsplit.getStart(), fsplit.getLength(), fsplit.getLocations());
        populatedSplits.add(ds);
    }
    return populatedSplits;
}
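The delimiter handling above rejects anything longer than one character. A minimal, hypothetical sketch of the driver-side setting it reads, assuming ConfigConstants.CONF_DELIMITER is the same key:

// must be a single character, or getSplits() logs an error and keeps the default
job.getConfiguration().set(ConfigConstants.CONF_DELIMITER, "|");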
From source file:com.marklogic.contentpump.DocumentInputFormat.java
License:Apache License
protected void getForestSplits(JobContext jobContext, ResultSequence result, List<ForestSplit> forestSplits,
        List<String> ruleUris) throws IOException {
    Configuration jobConf = jobContext.getConfiguration();
    super.getForestSplits(jobContext, result, forestSplits, ruleUris);
    // Third while loop: audit settings
    while (result.hasNext()) {
        ResultItem item = result.next();
        if (ItemType.XS_STRING != item.getItemType()) {
            throw new IOException("Unexpected item type " + item.getItemType().toString());
        }
        String itemStr = ((XSString) item.getItem()).asString();
        if ("AUDIT".equals(itemStr)) {
            continue;
        } else if ("mlcp-start".equals(itemStr)) {
            mlcpStartEventEnabled = true;
        } else if ("mlcp-finish".equalsIgnoreCase(itemStr)) {
            mlcpFinishEventEnabled = true;
        } else {
            throw new IOException("Unrecognized audit event " + itemStr);
        }
    }
    if (ruleUris != null && ruleUris.size() > 0) {
        AuditUtil.prepareAuditMlcpFinish(jobConf, ruleUris.size());
        if (LOG.isDebugEnabled()) {
            // TODO: use String.join(", ", ruleUris) once Java 8 is the minimum supported version
            LOG.debug("Redaction rules applied: " + StringUtils.join(ruleUris, ", "));
        }
    }
    if (mlcpStartEventEnabled) {
        AuditUtil.auditMlcpStart(jobConf, jobContext.getJobName());
    }
    jobConf.setBoolean(ConfigConstants.CONF_AUDIT_MLCPFINISH_ENABLED, mlcpFinishEventEnabled);
}
From source file:com.marklogic.contentpump.FileAndDirectoryInputFormat.java
License:Apache License
@Override
protected boolean isSplitable(JobContext context, Path filename) {
    Configuration conf = context.getConfiguration();
    return conf.getBoolean(ConfigConstants.CONF_SPLIT_INPUT, false)
            && !conf.getBoolean(ConfigConstants.INPUT_COMPRESSED, false);
}
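Both checks are purely configuration-driven, so splitting can be toggled on the driver. A minimal sketch, assuming CONF_SPLIT_INPUT and INPUT_COMPRESSED are the boolean keys shown above:

Configuration conf = job.getConfiguration();
conf.setBoolean(ConfigConstants.CONF_SPLIT_INPUT, true);   // allow splitting
conf.setBoolean(ConfigConstants.INPUT_COMPRESSED, false);  // splitting is disabled for compressed input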
From source file:com.marklogic.contentpump.FileAndDirectoryInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> splits = new ArrayList<InputSplit>();
    Configuration conf = job.getConfiguration();
    try {
        List<FileStatus> files = listStatus(job);
        long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
        long maxSize = getMaxSplitSize(job);
        for (FileStatus child : files) {
            Path path = child.getPath();
            FileSystem fs = path.getFileSystem(conf);
            // length is 0 for a dir according to FSDirectory.java in 0.20;
            // however, with Hadoop 2, a dir in the local fs has non-zero length
            long length = child.getLen();
            BlockLocation[] blkLocations = null;
            if (!child.isDirectory() || !(fs instanceof DistributedFileSystem)) {
                blkLocations = fs.getFileBlockLocations(child, 0, length);
            } else if (length != 0) {
                throw new IOException("non-zero length directory on HDFS: " + path.toUri().toString());
            }
            if ((length != 0) && isSplitable(job, path)) {
                long blockSize = child.getBlockSize();
                long splitSize = computeSplitSize(blockSize, minSize, maxSize);
                long bytesRemaining = length;
                while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(new FileSplit(path, length - bytesRemaining, splitSize,
                            blkLocations[blkIndex].getHosts()));
                    bytesRemaining -= splitSize;
                }
                if (bytesRemaining != 0) {
                    splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                            blkLocations[blkLocations.length - 1].getHosts()));
                }
            } else if (length != 0) {
                splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
            } else {
                // create an empty hosts array for zero-length files
                splits.add(new FileSplit(path, 0, length, new String[0]));
            }
        }
    } catch (InvalidInputException ex) {
        String inPath = conf.get(ConfigConstants.CONF_INPUT_DIRECTORY);
        String pattern = conf.get(ConfigConstants.CONF_INPUT_FILE_PATTERN, ".*");
        throw new IOException("No input files found with the specified input path " + inPath
                + " and input file pattern " + pattern, ex);
    }
    PathFilter jobFilter = getInputPathFilter(job);
    List<PathFilter> filters = new ArrayList<PathFilter>();
    filters.add(hiddenFileFilter);
    if (jobFilter != null) {
        filters.add(jobFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(filters);
    // take a second pass over the generated splits to extract files from directories
    int count = 0;
    // flatten directories until reaching SPLIT_COUNT_LIMIT
    while (count < splits.size() && splits.size() < SPLIT_COUNT_LIMIT) {
        FileSplit split = (FileSplit) splits.get(count);
        Path file = split.getPath();
        FileSystem fs = file.getFileSystem(conf);
        FileStatus status = fs.getFileStatus(file);
        if (status.isDirectory()) {
            FileStatus[] children = fs.listStatus(file, inputFilter);
            if (children.length + count < SPLIT_COUNT_LIMIT) {
                splits.remove(count);
                for (FileStatus stat : children) {
                    FileSplit child = new FileSplit(stat.getPath(), 0, stat.getLen(), null);
                    splits.add(child);
                }
            } else {
                count++;
            }
        } else {
            count++;
        }
    }
    return splits;
}
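The job-level PathFilter consulted via getInputPathFilter(job) is itself stored in the job configuration; a minimal sketch of installing one with the standard FileInputFormat helper (MyPathFilter is hypothetical):

// hypothetical filter; setInputPathFilter records the class in the job configuration,
// which is where getInputPathFilter(job) reads it back
public class MyPathFilter implements PathFilter {
    @Override
    public boolean accept(Path path) {
        return !path.getName().endsWith(".tmp"); // skip temporary files, for example
    }
}

// then, on the driver:
FileInputFormat.setInputPathFilter(job, MyPathFilter.class);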