List of usage examples for org.apache.hadoop.mapreduce.JobContext.getConfiguration()
public Configuration getConfiguration();
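Before the project-specific examples below, here is a minimal sketch of the common pattern: a custom InputFormat reads job-level settings through JobContext.getConfiguration() when computing splits. The MyInputFormat class and the "my.table.path" property are hypothetical, used only to illustrate the call.

import java.io.IOException;
import java.util.Collections;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;

// Hypothetical InputFormat showing how job settings are read through
// JobContext.getConfiguration() while computing splits.
public abstract class MyInputFormat<K, V> extends InputFormat<K, V> {

    @Override
    public List<InputSplit> getSplits(JobContext job) throws IOException {
        Configuration conf = job.getConfiguration();
        // "my.table.path" is a made-up property, only for illustration
        String tablePath = conf.get("my.table.path");
        if (tablePath == null) {
            throw new IOException("my.table.path is not set in the job configuration");
        }
        // A real implementation would build splits from the files under tablePath.
        return Collections.emptyList();
    }
}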
From source file:org.apache.carbondata.hadoop.api.CarbonInputFormat.java
License:Apache License
/**
 * Get data blocks of the given segment.
 */
protected List<CarbonInputSplit> getDataBlocksOfSegment(JobContext job, CarbonTable carbonTable,
    Expression expression, BitSet matchedPartitions, List<Segment> segmentIds,
    PartitionInfo partitionInfo, List<Integer> oldPartitionIdList) throws IOException {
  QueryStatisticsRecorder recorder = CarbonTimeStatisticsFactory.createDriverRecorder();
  QueryStatistic statistic = new QueryStatistic();
  // get tokens for all the required FileSystems for the table path
  TokenCache.obtainTokensForNamenodes(job.getCredentials(),
      new Path[] { new Path(carbonTable.getTablePath()) }, job.getConfiguration());
  List<ExtendedBlocklet> prunedBlocklets = getPrunedBlocklets(job, carbonTable, expression, segmentIds);
  List<CarbonInputSplit> resultFilteredBlocks = new ArrayList<>();
  int partitionIndex = 0;
  List<Integer> partitionIdList = new ArrayList<>();
  if (partitionInfo != null && partitionInfo.getPartitionType() != PartitionType.NATIVE_HIVE) {
    partitionIdList = partitionInfo.getPartitionIds();
  }
  for (ExtendedBlocklet blocklet : prunedBlocklets) {
    long partitionId = CarbonTablePath.DataFileUtil
        .getTaskIdFromTaskNo(CarbonTablePath.DataFileUtil.getTaskNo(blocklet.getPath()));
    // oldPartitionIdList is only used by the alter table partition command, because it changes
    // the partition info first and then reads data.
    // Other normal queries should use the newest partitionIdList.
    if (partitionInfo != null && partitionInfo.getPartitionType() != PartitionType.NATIVE_HIVE) {
      if (oldPartitionIdList != null) {
        partitionIndex = oldPartitionIdList.indexOf((int) partitionId);
      } else {
        partitionIndex = partitionIdList.indexOf((int) partitionId);
      }
    }
    if (partitionIndex != -1) {
      // matchedPartitions will be null in two cases:
      // 1. the table is not a partition table
      // 2. the table is a partition table and all partitions are matched by the query
      // For a partition table, the task id of the carbondata file name is the partition id;
      // if this partition is not required, skip it here.
      if (matchedPartitions == null || matchedPartitions.get(partitionIndex)) {
        CarbonInputSplit inputSplit = convertToCarbonInputSplit(blocklet);
        if (inputSplit != null) {
          resultFilteredBlocks.add(inputSplit);
        }
      }
    }
  }
  statistic.addStatistics(QueryStatisticsConstants.LOAD_BLOCKS_DRIVER, System.currentTimeMillis());
  recorder.recordStatisticsForDriver(statistic, job.getConfiguration().get("query.id"));
  return resultFilteredBlocks;
}
From source file:org.apache.carbondata.hadoop.api.CarbonInputFormat.java
License:Apache License
/**
 * Prune the blocklets using the filter expression with the available datamaps.
 * First prune with the default blocklet datamap, then prune with CG and FG datamaps.
 */
private List<ExtendedBlocklet> getPrunedBlocklets(JobContext job, CarbonTable carbonTable,
    Expression expression, List<Segment> segmentIds) throws IOException {
  ExplainCollector.addPruningInfo(carbonTable.getTableName());
  FilterResolverIntf resolver = null;
  if (expression != null) {
    carbonTable.processFilterExpression(expression, null, null);
    resolver = CarbonTable.resolveFilter(expression, carbonTable.getAbsoluteTableIdentifier());
    ExplainCollector.setFilterStatement(expression.getStatement());
  } else {
    ExplainCollector.setFilterStatement("none");
  }
  boolean distributedCG = Boolean.parseBoolean(
      CarbonProperties.getInstance().getProperty(CarbonCommonConstants.USE_DISTRIBUTED_DATAMAP,
          CarbonCommonConstants.USE_DISTRIBUTED_DATAMAP_DEFAULT));
  DataMapJob dataMapJob = DataMapUtil.getDataMapJob(job.getConfiguration());
  List<PartitionSpec> partitionsToPrune = getPartitionsToPrune(job.getConfiguration());
  // First prune using the default datamap on the driver side.
  TableDataMap defaultDataMap = DataMapStoreManager.getInstance().getDefaultDataMap(carbonTable);
  List<ExtendedBlocklet> prunedBlocklets = null;
  // Log the event so the user can follow the progress from the logs.
  LOG.info("Started block pruning ...");
  if (carbonTable.isTransactionalTable()) {
    prunedBlocklets = defaultDataMap.prune(segmentIds, resolver, partitionsToPrune);
  } else {
    prunedBlocklets = defaultDataMap.prune(segmentIds, expression, partitionsToPrune);
  }
  ExplainCollector.setDefaultDataMapPruningBlockHit(getBlockCount(prunedBlocklets));
  if (prunedBlocklets.size() == 0) {
    return prunedBlocklets;
  }
  DataMapChooser chooser = new DataMapChooser(getOrCreateCarbonTable(job.getConfiguration()));
  // Get the available CG datamaps and prune further.
  DataMapExprWrapper cgDataMapExprWrapper = chooser.chooseCGDataMap(resolver);
  if (cgDataMapExprWrapper != null) {
    // Prune segments from the already pruned blocklets
    pruneSegments(segmentIds, prunedBlocklets);
    List<ExtendedBlocklet> cgPrunedBlocklets;
    // Prune again with the CG datamap.
    if (distributedCG && dataMapJob != null) {
      cgPrunedBlocklets = DataMapUtil.executeDataMapJob(carbonTable, resolver, segmentIds,
          cgDataMapExprWrapper, dataMapJob, partitionsToPrune);
    } else {
      cgPrunedBlocklets = cgDataMapExprWrapper.prune(segmentIds, partitionsToPrune);
    }
    // Since index datamaps prune at segment scope, the result needs to be intersected
    // with the previously pruned result.
    prunedBlocklets = intersectFilteredBlocklets(carbonTable, prunedBlocklets, cgPrunedBlocklets);
    ExplainCollector.recordCGDataMapPruning(
        DataMapWrapperSimpleInfo.fromDataMapWrapper(cgDataMapExprWrapper), prunedBlocklets.size(),
        getBlockCount(prunedBlocklets));
  }
  if (prunedBlocklets.size() == 0) {
    return prunedBlocklets;
  }
  // Now try to prune with the FG datamap.
  if (isFgDataMapPruningEnable(job.getConfiguration()) && dataMapJob != null) {
    DataMapExprWrapper fgDataMapExprWrapper = chooser.chooseFGDataMap(resolver);
    if (fgDataMapExprWrapper != null) {
      // Prune segments from the already pruned blocklets
      pruneSegments(segmentIds, prunedBlocklets);
      List<ExtendedBlocklet> fgPrunedBlocklets = DataMapUtil.executeDataMapJob(carbonTable,
          resolver, segmentIds, fgDataMapExprWrapper, dataMapJob, partitionsToPrune);
      // Note that 'fgPrunedBlocklets' has extra datamap-related info compared with
      // 'prunedBlocklets', so the intersection should keep the elements in 'fgPrunedBlocklets'.
      prunedBlocklets = intersectFilteredBlocklets(carbonTable, prunedBlocklets, fgPrunedBlocklets);
      ExplainCollector.recordFGDataMapPruning(
          DataMapWrapperSimpleInfo.fromDataMapWrapper(fgDataMapExprWrapper), prunedBlocklets.size(),
          getBlockCount(prunedBlocklets));
    }
  }
  LOG.info("Finished block pruning ...");
  return prunedBlocklets;
}
From source file:org.apache.carbondata.hadoop.api.CarbonInputFormat.java
License:Apache License
@Override
protected boolean isSplitable(JobContext context, Path filename) {
  try {
    // Don't split the file if it is on the local file system
    FileSystem fileSystem = filename.getFileSystem(context.getConfiguration());
    if (fileSystem instanceof LocalFileSystem) {
      return false;
    }
  } catch (Exception e) {
    return true;
  }
  return true;
}
From source file:org.apache.carbondata.hadoop.api.CarbonOutputCommitter.java
License:Apache License
/**
 * Update the tablestatus as in-progress while setting up the job.
 *
 * @param context
 * @throws IOException
 */
@Override
public void setupJob(JobContext context) throws IOException {
  super.setupJob(context);
  boolean overwriteSet = CarbonTableOutputFormat.isOverwriteSet(context.getConfiguration());
  CarbonLoadModel loadModel = CarbonTableOutputFormat.getLoadModel(context.getConfiguration());
  if (loadModel.getSegmentId() == null) {
    CarbonLoaderUtil.readAndUpdateLoadProgressInTableMeta(loadModel, overwriteSet);
  }
  // Take segment lock
  segmentLock = CarbonLockFactory.getCarbonLockObj(
      loadModel.getCarbonDataLoadSchema().getCarbonTable().getAbsoluteTableIdentifier(),
      CarbonTablePath.addSegmentPrefix(loadModel.getSegmentId()) + LockUsage.LOCK);
  if (!segmentLock.lockWithRetries()) {
    throw new RuntimeException("Already segment is locked for loading, not supposed happen");
  }
  CarbonTableOutputFormat.setLoadModel(context.getConfiguration(), loadModel);
}
From source file:org.apache.carbondata.hadoop.api.CarbonOutputCommitter.java
License:Apache License
/**
 * Update the tablestatus as success after the job succeeds.
 *
 * @param context
 * @throws IOException
 */
@Override
public void commitJob(JobContext context) throws IOException {
  try {
    super.commitJob(context);
  } catch (IOException e) {
    // Ignore: during concurrent loads, an attempt to remove temporary folders created by
    // another load may cause a file-not-found exception. This does not impact the carbon load.
    LOGGER.warn(e.getMessage());
  }
  boolean overwriteSet = CarbonTableOutputFormat.isOverwriteSet(context.getConfiguration());
  CarbonLoadModel loadModel = CarbonTableOutputFormat.getLoadModel(context.getConfiguration());
  LoadMetadataDetails newMetaEntry = loadModel.getCurrentLoadMetadataDetail();
  String readPath = CarbonTablePath.getSegmentFilesLocation(loadModel.getTablePath())
      + CarbonCommonConstants.FILE_SEPARATOR + loadModel.getSegmentId() + "_"
      + loadModel.getFactTimeStamp() + ".tmp";
  // Merge all partition files into a single file.
  String segmentFileName = SegmentFileStore.genSegmentFileName(loadModel.getSegmentId(),
      String.valueOf(loadModel.getFactTimeStamp()));
  SegmentFileStore.SegmentFile segmentFile = SegmentFileStore.mergeSegmentFiles(readPath,
      segmentFileName, CarbonTablePath.getSegmentFilesLocation(loadModel.getTablePath()));
  if (segmentFile != null) {
    if (null == newMetaEntry) {
      throw new RuntimeException("Internal Error");
    }
    // Move all files from the temp directory of each segment to the partition directory
    SegmentFileStore.moveFromTempFolder(segmentFile,
        loadModel.getSegmentId() + "_" + loadModel.getFactTimeStamp() + ".tmp",
        loadModel.getTablePath());
    newMetaEntry.setSegmentFile(segmentFileName + CarbonTablePath.SEGMENT_EXT);
  }
  OperationContext operationContext = (OperationContext) getOperationContext();
  String uuid = "";
  if (loadModel.getCarbonDataLoadSchema().getCarbonTable().isChildDataMap()
      && operationContext != null) {
    uuid = operationContext.getProperty("uuid").toString();
  }
  CarbonLoaderUtil.populateNewLoadMetaEntry(newMetaEntry, SegmentStatus.SUCCESS,
      loadModel.getFactTimeStamp(), true);
  CarbonTable carbonTable = loadModel.getCarbonDataLoadSchema().getCarbonTable();
  long segmentSize = CarbonLoaderUtil.addDataIndexSizeIntoMetaEntry(newMetaEntry,
      loadModel.getSegmentId(), carbonTable);
  if (segmentSize > 0 || overwriteSet) {
    if (operationContext != null && carbonTable.hasAggregationDataMap()) {
      operationContext.setProperty("current.segmentfile", newMetaEntry.getSegmentFile());
      LoadEvents.LoadTablePreStatusUpdateEvent event = new LoadEvents.LoadTablePreStatusUpdateEvent(
          carbonTable.getCarbonTableIdentifier(), loadModel);
      try {
        OperationListenerBus.getInstance().fireEvent(event, operationContext);
      } catch (Exception e) {
        throw new IOException(e);
      }
    }
    String uniqueId = null;
    if (overwriteSet) {
      if (!loadModel.isCarbonTransactionalTable()) {
        CarbonLoaderUtil.deleteNonTransactionalTableForInsertOverwrite(loadModel);
      } else {
        if (segmentSize == 0) {
          newMetaEntry.setSegmentStatus(SegmentStatus.MARKED_FOR_DELETE);
        }
        uniqueId = overwritePartitions(loadModel, newMetaEntry, uuid);
      }
    } else {
      CarbonLoaderUtil.recordNewLoadMetadata(newMetaEntry, loadModel, false, false, uuid);
    }
    DataMapStatusManager.disableAllLazyDataMaps(carbonTable);
    if (operationContext != null) {
      LoadEvents.LoadTablePostStatusUpdateEvent postStatusUpdateEvent =
          new LoadEvents.LoadTablePostStatusUpdateEvent(loadModel);
      try {
        OperationListenerBus.getInstance().fireEvent(postStatusUpdateEvent, operationContext);
      } catch (Exception e) {
        throw new IOException(e);
      }
    }
    String updateTime =
        context.getConfiguration().get(CarbonTableOutputFormat.UPADTE_TIMESTAMP, null);
    String segmentsToBeDeleted =
        context.getConfiguration().get(CarbonTableOutputFormat.SEGMENTS_TO_BE_DELETED, "");
    List<Segment> segmentDeleteList = Segment.toSegmentList(segmentsToBeDeleted.split(","), null);
    Set<Segment> segmentSet = new HashSet<>(
        new SegmentStatusManager(carbonTable.getAbsoluteTableIdentifier(), context.getConfiguration())
            .getValidAndInvalidSegments().getValidSegments());
    if (updateTime != null) {
      CarbonUpdateUtil.updateTableMetadataStatus(segmentSet, carbonTable, updateTime, true,
          segmentDeleteList);
    } else if (uniqueId != null) {
      CarbonUpdateUtil.updateTableMetadataStatus(segmentSet, carbonTable, uniqueId, true,
          segmentDeleteList);
    }
  } else {
    CarbonLoaderUtil.updateTableStatusForFailure(loadModel);
  }
  if (segmentLock != null) {
    segmentLock.unlock();
  }
}
From source file:org.apache.carbondata.hadoop.api.CarbonOutputCommitter.java
License:Apache License
/**
 * Update the tablestatus as failed if any failure happens, and also clean up
 * any temp folders that exist.
 *
 * @param context
 * @param state
 * @throws IOException
 */
@Override
public void abortJob(JobContext context, JobStatus.State state) throws IOException {
  try {
    super.abortJob(context, state);
    CarbonLoadModel loadModel = CarbonTableOutputFormat.getLoadModel(context.getConfiguration());
    CarbonLoaderUtil.updateTableStatusForFailure(loadModel);
    String segmentFileName = loadModel.getSegmentId() + "_" + loadModel.getFactTimeStamp();
    LoadMetadataDetails metadataDetail = loadModel.getCurrentLoadMetadataDetail();
    if (metadataDetail != null) {
      // In case the segment file is already created for this job, just link it so that it
      // will be used while cleaning.
      if (!metadataDetail.getSegmentStatus().equals(SegmentStatus.SUCCESS)) {
        String readPath = CarbonTablePath.getSegmentFilesLocation(loadModel.getTablePath())
            + CarbonCommonConstants.FILE_SEPARATOR + segmentFileName + CarbonTablePath.SEGMENT_EXT;
        if (FileFactory.getCarbonFile(readPath).exists()) {
          metadataDetail.setSegmentFile(segmentFileName + CarbonTablePath.SEGMENT_EXT);
        }
      }
    }
    // Clean the temp files
    CarbonFile segTmpFolder = FileFactory.getCarbonFile(
        CarbonTablePath.getSegmentFilesLocation(loadModel.getTablePath())
            + CarbonCommonConstants.FILE_SEPARATOR + segmentFileName + ".tmp");
    // delete the temp segment folder
    if (segTmpFolder.exists()) {
      FileFactory.deleteAllCarbonFilesOfDir(segTmpFolder);
    }
    CarbonFile segmentFilePath = FileFactory.getCarbonFile(
        CarbonTablePath.getSegmentFilesLocation(loadModel.getTablePath())
            + CarbonCommonConstants.FILE_SEPARATOR + segmentFileName + CarbonTablePath.SEGMENT_EXT);
    // Delete the temp data folders of this job if they exist
    if (segmentFilePath.exists()) {
      SegmentFileStore fileStore = new SegmentFileStore(loadModel.getTablePath(),
          segmentFileName + CarbonTablePath.SEGMENT_EXT);
      SegmentFileStore.removeTempFolder(fileStore.getLocationMap(), segmentFileName + ".tmp",
          loadModel.getTablePath());
    }
    LOGGER.error("Loading failed with job status : " + state);
  } finally {
    if (segmentLock != null) {
      segmentLock.unlock();
    }
  }
}
From source file:org.apache.carbondata.hadoop.api.CarbonTableInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
  // Works in the following steps:
  // 1. get all currently valid segments
  // 2. for each segment, get all input splits
  List<InputSplit> output = new LinkedList<>();
  Expression filter = getFilter(job.getConfiguration());
  Segment[] segments = segmentManager.getAllValidSegments();
  FilterResolverIntf filterResolver = CarbonInputFormatUtil.resolveFilter(filter, null);
  for (Segment segment : segments) {
    List<InputSplit> splits = segment.getSplits(job, filterResolver);
    output.addAll(splits);
  }
  return output;
}
From source file:org.apache.carbondata.hadoop.CarbonInputFormat.java
License:Apache License
/**
 * {@inheritDoc}
 * The configuration FileInputFormat.INPUT_DIR is used to get the table path to read.
 *
 * @param job
 * @return List<InputSplit> list of CarbonInputSplit
 * @throws IOException
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
  try {
    CarbonTable carbonTable = getCarbonTable(job.getConfiguration());
    Object filterPredicates = getFilterPredicates(job.getConfiguration());
    if (getValidSegments(job).length == 0) {
      // Get the valid segments from the carbon store.
      SegmentStatusManager.ValidSegmentsInfo validSegments = new SegmentStatusManager(
          getAbsoluteTableIdentifier(job.getConfiguration())).getValidSegments();
      if (validSegments.listOfValidSegments.isEmpty()) {
        return new ArrayList<InputSplit>();
      }
      setSegmentsToAccess(job.getConfiguration(), validSegments.listOfValidSegments);
    }
    if (filterPredicates == null) {
      return getSplitsInternal(job);
    } else {
      if (filterPredicates instanceof Expression) {
        // process and resolve the expression
        CarbonInputFormatUtil.processFilterExpression((Expression) filterPredicates, carbonTable);
        return getSplits(job, CarbonInputFormatUtil.resolveFilter((Expression) filterPredicates,
            getAbsoluteTableIdentifier(job.getConfiguration())));
      } else {
        // the user has set an already resolved expression
        return getSplits(job, (FilterResolverIntf) filterPredicates);
      }
    }
  } catch (Exception ex) {
    throw new IOException(ex);
  }
}
From source file:org.apache.carbondata.hadoop.CarbonInputFormat.java
License:Apache License
/**
 * {@inheritDoc}
 * The configurations FileInputFormat.INPUT_DIR and CarbonInputFormat.INPUT_SEGMENT_NUMBERS
 * are used to get the table path to read.
 *
 * @return
 * @throws IOException
 */
private List<InputSplit> getSplits(JobContext job, FilterResolverIntf filterResolver)
    throws IOException, IndexBuilderException {
  List<InputSplit> result = new LinkedList<InputSplit>();
  FilterExpressionProcessor filterExpressionProcessor = new FilterExpressionProcessor();
  AbsoluteTableIdentifier absoluteTableIdentifier =
      getAbsoluteTableIdentifier(job.getConfiguration());
  // for each segment, fetch the blocks matching the filter in the driver BTree
  for (String segmentNo : getValidSegments(job)) {
    List<DataRefNode> dataRefNodes = getDataBlocksOfSegment(job, filterExpressionProcessor,
        absoluteTableIdentifier, filterResolver, segmentNo);
    for (DataRefNode dataRefNode : dataRefNodes) {
      BlockBTreeLeafNode leafNode = (BlockBTreeLeafNode) dataRefNode;
      TableBlockInfo tableBlockInfo = leafNode.getTableBlockInfo();
      result.add(new CarbonInputSplit(segmentNo, new Path(tableBlockInfo.getFilePath()),
          tableBlockInfo.getBlockOffset(), tableBlockInfo.getBlockLength(),
          tableBlockInfo.getLocations()));
    }
  }
  return result;
}
From source file:org.apache.carbondata.hadoop.CarbonInputFormat.java
License:Apache License
/**
 * Get the total number of rows. Same as count(*).
 *
 * @throws IOException
 * @throws IndexBuilderException
 */
public long getRowCount(JobContext job) throws IOException, IndexBuilderException {
  long rowCount = 0;
  AbsoluteTableIdentifier absoluteTableIdentifier =
      getAbsoluteTableIdentifier(job.getConfiguration());
  SegmentStatusManager.ValidSegmentsInfo validSegments = new SegmentStatusManager(
      getAbsoluteTableIdentifier(job.getConfiguration())).getValidSegments();
  setSegmentsToAccess(job.getConfiguration(), validSegments.listOfValidSegments);
  // number of cores used to load the blocks in the driver
  int numberOfCores = CarbonCommonConstants.NUMBER_OF_CORE_TO_LOAD_DRIVER_SEGMENT_DEFAULT_VALUE;
  try {
    numberOfCores = Integer.parseInt(CarbonProperties.getInstance()
        .getProperty(CarbonCommonConstants.NUMBER_OF_CORE_TO_LOAD_DRIVER_SEGMENT));
  } catch (NumberFormatException e) {
    numberOfCores = CarbonCommonConstants.NUMBER_OF_CORE_TO_LOAD_DRIVER_SEGMENT_DEFAULT_VALUE;
  }
  // create a thread pool
  ExecutorService threadPool = Executors.newFixedThreadPool(numberOfCores);
  List<Future<Map<String, AbstractIndex>>> loadedBlocks =
      new ArrayList<Future<Map<String, AbstractIndex>>>();
  // for each segment, fetch the blocks matching the filter in the driver BTree
  for (String segmentNo : getValidSegments(job)) {
    // submit the task
    loadedBlocks.add(threadPool.submit(new BlocksLoaderThread(job, absoluteTableIdentifier, segmentNo)));
  }
  threadPool.shutdown();
  try {
    threadPool.awaitTermination(1, TimeUnit.HOURS);
  } catch (InterruptedException e) {
    throw new IndexBuilderException(e);
  }
  try {
    // add up the rows of all blocks to get the total row count
    for (Future<Map<String, AbstractIndex>> block : loadedBlocks) {
      for (AbstractIndex abstractIndex : block.get().values()) {
        rowCount += abstractIndex.getTotalNumberOfRows();
      }
    }
  } catch (InterruptedException | ExecutionException e) {
    throw new IndexBuilderException(e);
  }
  return rowCount;
}