Java tutorial
/** * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.linkedin.pinot.core.segment.creator.impl; import com.linkedin.pinot.common.data.FieldSpec; import com.linkedin.pinot.common.data.Schema; import com.linkedin.pinot.common.data.StarTreeIndexSpec; import com.linkedin.pinot.core.data.GenericRow; import com.linkedin.pinot.core.indexsegment.generator.SegmentGeneratorConfig; import com.linkedin.pinot.core.segment.creator.ColumnIndexCreationInfo; import com.linkedin.pinot.core.segment.creator.ForwardIndexCreator; import com.linkedin.pinot.core.segment.creator.InvertedIndexCreator; import com.linkedin.pinot.core.segment.creator.MultiValueForwardIndexCreator; import com.linkedin.pinot.core.segment.creator.SegmentCreator; import com.linkedin.pinot.core.segment.creator.SegmentIndexCreationInfo; import com.linkedin.pinot.core.segment.creator.SingleValueForwardIndexCreator; import com.linkedin.pinot.core.segment.creator.impl.fwd.MultiValueUnsortedForwardIndexCreator; import com.linkedin.pinot.core.segment.creator.impl.fwd.SingleValueSortedForwardIndexCreator; import com.linkedin.pinot.core.segment.creator.impl.fwd.SingleValueUnsortedForwardIndexCreator; import com.linkedin.pinot.core.segment.creator.impl.inv.OffHeapBitmapInvertedIndexCreator; import com.linkedin.pinot.core.startree.hll.HllConfig; import java.io.File; import java.io.IOException; import java.util.HashMap; import java.util.Map; import org.apache.commons.configuration.ConfigurationException; import org.apache.commons.configuration.PropertiesConfiguration; import org.apache.commons.lang.StringEscapeUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import static com.linkedin.pinot.core.segment.creator.impl.V1Constants.MetadataKeys.Column.*; import static com.linkedin.pinot.core.segment.creator.impl.V1Constants.MetadataKeys.Segment.*; import static com.linkedin.pinot.core.segment.creator.impl.V1Constants.MetadataKeys.StarTree.*; /** * Segment creator which writes data in a columnar form. * * Nov 9, 2014 */ public class SegmentColumnarIndexCreator implements SegmentCreator { private Logger LOGGER = LoggerFactory.getLogger(SegmentColumnarIndexCreator.class); // TODO Refactor class name to match interface name private SegmentGeneratorConfig config; private Map<String, ColumnIndexCreationInfo> indexCreationInfoMap; private Map<String, SegmentDictionaryCreator> dictionaryCreatorMap; private Map<String, ForwardIndexCreator> forwardIndexCreatorMap; private Map<String, InvertedIndexCreator> invertedIndexCreatorMap; private String segmentName; private Schema schema; private File file; private int totalDocs; private int totalRawDocs; private int totalAggDocs; private int totalErrors; private int totalNulls; private int totalConversions; private int totalNullCols; private int docIdCounter; private char paddingCharacter; private Map<String, Map<Object, Object>> dictionaryCache = new HashMap<String, Map<Object, Object>>(); @Override public void init(SegmentGeneratorConfig segmentCreationSpec, SegmentIndexCreationInfo segmentIndexCreationInfo, Map<String, ColumnIndexCreationInfo> indexCreationInfoMap, Schema schema, File outDir) throws Exception { docIdCounter = 0; config = segmentCreationSpec; this.indexCreationInfoMap = indexCreationInfoMap; dictionaryCreatorMap = new HashMap<String, SegmentDictionaryCreator>(); forwardIndexCreatorMap = new HashMap<String, ForwardIndexCreator>(); this.indexCreationInfoMap = indexCreationInfoMap; invertedIndexCreatorMap = new HashMap<String, InvertedIndexCreator>(); file = outDir; // Check that the output directory does not exist if (file.exists()) { throw new RuntimeException("Segment output directory " + file.getAbsolutePath() + " already exists."); } file.mkdir(); this.schema = schema; this.totalDocs = segmentIndexCreationInfo.getTotalDocs(); this.totalAggDocs = segmentIndexCreationInfo.getTotalAggDocs(); this.totalRawDocs = segmentIndexCreationInfo.getTotalRawDocs(); this.totalErrors = segmentIndexCreationInfo.getTotalErrors(); this.totalNulls = segmentIndexCreationInfo.getTotalNulls(); this.totalConversions = segmentIndexCreationInfo.getTotalConversions(); this.totalNullCols = segmentIndexCreationInfo.getTotalNullCols(); this.paddingCharacter = segmentCreationSpec.getPaddingCharacter(); // Initialize and build dictionaries for (final FieldSpec spec : schema.getAllFieldSpecs()) { final ColumnIndexCreationInfo info = indexCreationInfoMap.get(spec.getName()); if (info.isCreateDictionary()) { dictionaryCreatorMap.put(spec.getName(), new SegmentDictionaryCreator(info.hasNulls(), info.getSortedUniqueElementsArray(), spec, file, paddingCharacter)); } else { throw new RuntimeException("Creation of indices without dictionaries is not implemented!"); } } // For each column, build its dictionary and initialize a forwards and an inverted index for (final String column : dictionaryCreatorMap.keySet()) { ColumnIndexCreationInfo indexCreationInfo = indexCreationInfoMap.get(column); boolean[] isSorted = new boolean[1]; isSorted[0] = indexCreationInfo.isSorted(); dictionaryCreatorMap.get(column).build(isSorted); indexCreationInfo.setSorted(isSorted[0]); dictionaryCache.put(column, new HashMap<Object, Object>()); int uniqueValueCount = indexCreationInfo.getDistinctValueCount(); if (schema.getFieldSpecFor(column).isSingleValueField()) { if (indexCreationInfo.isSorted()) { forwardIndexCreatorMap.put(column, new SingleValueSortedForwardIndexCreator(file, uniqueValueCount, schema.getFieldSpecFor(column))); } else { forwardIndexCreatorMap.put(column, new SingleValueUnsortedForwardIndexCreator(schema.getFieldSpecFor(column), file, uniqueValueCount, totalDocs, indexCreationInfo.getTotalNumberOfEntries(), indexCreationInfo.hasNulls())); } } else { forwardIndexCreatorMap.put(column, new MultiValueUnsortedForwardIndexCreator(schema.getFieldSpecFor(column), file, uniqueValueCount, totalDocs, indexCreationInfo.getTotalNumberOfEntries(), indexCreationInfo.hasNulls())); } } for (String column : config.getInvertedIndexCreationColumns()) { if (!schema.hasColumn(column)) { LOGGER.warn("Skipping enabling index on column:{} since its missing in schema", column); continue; } ColumnIndexCreationInfo indexCreationInfo = indexCreationInfoMap.get(column); int uniqueValueCount = indexCreationInfo.getDistinctValueCount(); OffHeapBitmapInvertedIndexCreator invertedIndexCreator = new OffHeapBitmapInvertedIndexCreator(file, uniqueValueCount, totalDocs, indexCreationInfo.getTotalNumberOfEntries(), schema.getFieldSpecFor(column)); invertedIndexCreatorMap.put(column, invertedIndexCreator); } } @Override public void indexRow(GenericRow row) { for (final String column : dictionaryCreatorMap.keySet()) { try { Object columnValueToIndex = row.getValue(column); if (columnValueToIndex == null) { throw new RuntimeException("Null value for column:" + column); } if (schema.getFieldSpecFor(column).isSingleValueField()) { int dictionaryIndex = dictionaryCreatorMap.get(column).indexOfSV(columnValueToIndex); ((SingleValueForwardIndexCreator) forwardIndexCreatorMap.get(column)).index(docIdCounter, dictionaryIndex); // TODO : {refactor inverted index addition} if (invertedIndexCreatorMap.containsKey(column)) { invertedIndexCreatorMap.get(column).add(docIdCounter, dictionaryIndex); } } else { int[] dictionaryIndex = dictionaryCreatorMap.get(column).indexOfMV(columnValueToIndex); ((MultiValueForwardIndexCreator) forwardIndexCreatorMap.get(column)).index(docIdCounter, dictionaryIndex); // TODO : {refactor inverted index addition} if (invertedIndexCreatorMap.containsKey(column)) { invertedIndexCreatorMap.get(column).add(docIdCounter, dictionaryIndex); } } } catch (Exception e) { throw new RuntimeException("Exception while indexing column:" + column, e); } } docIdCounter++; } @Override public void setSegmentName(String segmentName) { this.segmentName = segmentName; } @Override public void seal() throws ConfigurationException, IOException { for (final String column : forwardIndexCreatorMap.keySet()) { forwardIndexCreatorMap.get(column).close(); dictionaryCreatorMap.get(column).close(); } // The map is only initialized for columns that have inverted index creation enabled. for (final String invertedColumn : invertedIndexCreatorMap.keySet()) { invertedIndexCreatorMap.get(invertedColumn).seal(); } writeMetadata(); } void writeMetadata() throws ConfigurationException { PropertiesConfiguration properties = new PropertiesConfiguration( new File(file, V1Constants.MetadataKeys.METADATA_FILE_NAME)); properties.setProperty(SEGMENT_CREATOR_VERSION, config.getCreatorVersion()); properties.setProperty(SEGMENT_PADDING_CHARACTER, StringEscapeUtils.escapeJava(Character.toString(config.getPaddingCharacter()))); properties.setProperty(SEGMENT_NAME, segmentName); properties.setProperty(TABLE_NAME, config.getTableName()); properties.setProperty(DIMENSIONS, config.getDimensions()); properties.setProperty(METRICS, config.getMetrics()); properties.setProperty(TIME_COLUMN_NAME, config.getTimeColumnName()); properties.setProperty(TIME_INTERVAL, "not_there"); properties.setProperty(SEGMENT_TOTAL_RAW_DOCS, String.valueOf(totalRawDocs)); properties.setProperty(SEGMENT_TOTAL_AGGREGATE_DOCS, String.valueOf(totalAggDocs)); properties.setProperty(SEGMENT_TOTAL_DOCS, String.valueOf(totalDocs)); properties.setProperty(STAR_TREE_ENABLED, String.valueOf(config.isEnableStarTreeIndex())); properties.setProperty(SEGMENT_TOTAL_ERRORS, String.valueOf(totalErrors)); properties.setProperty(SEGMENT_TOTAL_NULLS, String.valueOf(totalNulls)); properties.setProperty(SEGMENT_TOTAL_CONVERSIONS, String.valueOf(totalConversions)); properties.setProperty(SEGMENT_TOTAL_NULL_COLS, String.valueOf(totalNullCols)); StarTreeIndexSpec starTreeIndexSpec = config.getStarTreeIndexSpec(); if (starTreeIndexSpec != null) { properties.setProperty(STAR_TREE_SPLIT_ORDER, starTreeIndexSpec.getDimensionsSplitOrder()); properties.setProperty(STAR_TREE_MAX_LEAF_RECORDS, starTreeIndexSpec.getMaxLeafRecords()); properties.setProperty(STAR_TREE_SKIP_STAR_NODE_CREATION_FOR_DIMENSIONS, starTreeIndexSpec.getSkipStarNodeCreationForDimensions()); properties.setProperty(STAR_TREE_SKIP_MATERIALIZATION_CARDINALITY, starTreeIndexSpec.getskipMaterializationCardinalityThreshold()); properties.setProperty(STAR_TREE_SKIP_MATERIALIZATION_FOR_DIMENSIONS, starTreeIndexSpec.getskipMaterializationForDimensions()); } HllConfig hllConfig = config.getHllConfig(); Map<String, String> derivedHllFieldToOriginMap = null; if (hllConfig != null) { properties.setProperty(SEGMENT_HLL_LOG2M, hllConfig.getHllLog2m()); derivedHllFieldToOriginMap = hllConfig.getDerivedHllFieldToOriginMap(); } String timeColumn = config.getTimeColumnName(); if (indexCreationInfoMap.get(timeColumn) != null) { properties.setProperty(SEGMENT_START_TIME, indexCreationInfoMap.get(timeColumn).getMin()); properties.setProperty(SEGMENT_END_TIME, indexCreationInfoMap.get(timeColumn).getMax()); properties.setProperty(TIME_UNIT, config.getSegmentTimeUnit()); } if (config.containsCustomProperty(SEGMENT_START_TIME)) { properties.setProperty(SEGMENT_START_TIME, config.getStartTime()); } if (config.containsCustomProperty(SEGMENT_END_TIME)) { properties.setProperty(SEGMENT_END_TIME, config.getEndTime()); } if (config.containsCustomProperty(TIME_UNIT)) { properties.setProperty(TIME_UNIT, config.getSegmentTimeUnit()); } for (Map.Entry<String, String> entry : config.getCustomProperties().entrySet()) { properties.setProperty(entry.getKey(), entry.getValue()); } for (Map.Entry<String, ColumnIndexCreationInfo> entry : indexCreationInfoMap.entrySet()) { String column = entry.getKey(); ColumnIndexCreationInfo columnIndexCreationInfo = entry.getValue(); int dictionaryElementSize = dictionaryCreatorMap.get(column).getStringColumnMaxLength(); // TODO: after fixing the server-side dependency on HAS_INVERTED_INDEX and deployed, set HAS_INVERTED_INDEX properly // The hasInvertedIndex flag in segment metadata is picked up in ColumnMetadata, and will be used during the query // plan phase. If it is set to false, then inverted indexes are not used in queries even if they are created via table // configs on segment load. So, we set it to true here for now, until we fix the server to update the value inside // ColumnMetadata, export information to the query planner that the inverted index available is current and can be used. // // boolean hasInvertedIndex = invertedIndexCreatorMap.containsKey(); boolean hasInvertedIndex = true; String hllOriginColumn = null; if (derivedHllFieldToOriginMap != null) { hllOriginColumn = derivedHllFieldToOriginMap.get(column); } addColumnMetadataInfo(properties, column, columnIndexCreationInfo, totalDocs, totalRawDocs, totalAggDocs, schema.getFieldSpecFor(column), dictionaryElementSize, hasInvertedIndex, hllOriginColumn); } properties.save(); } public static void addColumnMetadataInfo(PropertiesConfiguration properties, String column, ColumnIndexCreationInfo columnIndexCreationInfo, int totalDocs, int totalRawDocs, int totalAggDocs, FieldSpec fieldSpec, int dictionaryElementSize, boolean hasInvertedIndex, String hllOriginColumn) { int distinctValueCount = columnIndexCreationInfo.getDistinctValueCount(); properties.setProperty(getKeyFor(column, CARDINALITY), String.valueOf(distinctValueCount)); properties.setProperty(getKeyFor(column, TOTAL_DOCS), String.valueOf(totalDocs)); properties.setProperty(getKeyFor(column, TOTAL_RAW_DOCS), String.valueOf(totalRawDocs)); properties.setProperty(getKeyFor(column, TOTAL_AGG_DOCS), String.valueOf(totalAggDocs)); properties.setProperty(getKeyFor(column, DATA_TYPE), String.valueOf(fieldSpec.getDataType())); properties.setProperty(getKeyFor(column, BITS_PER_ELEMENT), String.valueOf(SingleValueUnsortedForwardIndexCreator.getNumOfBits(distinctValueCount))); properties.setProperty(getKeyFor(column, DICTIONARY_ELEMENT_SIZE), String.valueOf(dictionaryElementSize)); properties.setProperty(getKeyFor(column, COLUMN_TYPE), String.valueOf(fieldSpec.getFieldType())); properties.setProperty(getKeyFor(column, IS_SORTED), String.valueOf(columnIndexCreationInfo.isSorted())); properties.setProperty(getKeyFor(column, HAS_NULL_VALUE), String.valueOf(columnIndexCreationInfo.hasNulls())); properties.setProperty(getKeyFor(column, HAS_DICTIONARY), String.valueOf(columnIndexCreationInfo.isCreateDictionary())); properties.setProperty(V1Constants.MetadataKeys.Column.getKeyFor(column, HAS_INVERTED_INDEX), String.valueOf(hasInvertedIndex)); properties.setProperty(V1Constants.MetadataKeys.Column.getKeyFor(column, IS_SINGLE_VALUED), String.valueOf(fieldSpec.isSingleValueField())); properties.setProperty(V1Constants.MetadataKeys.Column.getKeyFor(column, MAX_MULTI_VALUE_ELEMTS), String.valueOf(columnIndexCreationInfo.getMaxNumberOfMultiValueElements())); properties.setProperty(V1Constants.MetadataKeys.Column.getKeyFor(column, TOTAL_NUMBER_OF_ENTRIES), String.valueOf(columnIndexCreationInfo.getTotalNumberOfEntries())); properties.setProperty(V1Constants.MetadataKeys.Column.getKeyFor(column, IS_AUTO_GENERATED), String.valueOf(columnIndexCreationInfo.isAutoGenerated())); // HLL derived fields if (hllOriginColumn != null) { properties.setProperty(V1Constants.MetadataKeys.Column.getKeyFor(column, ORIGIN_COLUMN), hllOriginColumn); properties.setProperty(V1Constants.MetadataKeys.Column.getKeyFor(column, DERIVED_METRIC_TYPE), "HLL"); } Object defaultNullValue = columnIndexCreationInfo.getDefaultNullValue(); if (defaultNullValue == null) { defaultNullValue = fieldSpec.getDefaultNullValue(); } properties.setProperty(V1Constants.MetadataKeys.Column.getKeyFor(column, DEFAULT_NULL_VALUE), String.valueOf(defaultNullValue)); } public static void removeColumnMetadataInfo(PropertiesConfiguration properties, String column) { properties.clearProperty(getKeyFor(column, CARDINALITY)); properties.clearProperty(getKeyFor(column, TOTAL_DOCS)); properties.clearProperty(getKeyFor(column, TOTAL_RAW_DOCS)); properties.clearProperty(getKeyFor(column, TOTAL_AGG_DOCS)); properties.clearProperty(getKeyFor(column, DATA_TYPE)); properties.clearProperty(getKeyFor(column, BITS_PER_ELEMENT)); properties.clearProperty(getKeyFor(column, DICTIONARY_ELEMENT_SIZE)); properties.clearProperty(getKeyFor(column, COLUMN_TYPE)); properties.clearProperty(getKeyFor(column, IS_SORTED)); properties.clearProperty(getKeyFor(column, HAS_NULL_VALUE)); properties.clearProperty(getKeyFor(column, HAS_DICTIONARY)); properties.clearProperty(getKeyFor(column, HAS_INVERTED_INDEX)); properties.clearProperty(getKeyFor(column, IS_SINGLE_VALUED)); properties.clearProperty(getKeyFor(column, MAX_MULTI_VALUE_ELEMTS)); properties.clearProperty(getKeyFor(column, TOTAL_NUMBER_OF_ENTRIES)); properties.clearProperty(getKeyFor(column, IS_AUTO_GENERATED)); properties.clearProperty(getKeyFor(column, DEFAULT_NULL_VALUE)); } }