org.apache.carbondata.datamap.lucene.LuceneDataMapWriter.java Source code


Introduction

Here is the source code for org.apache.carbondata.datamap.lucene.LuceneDataMapWriter.java. This class is CarbonData's internal DataMapWriter implementation that builds a Lucene index over the configured index columns while data is being loaded.
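
Before reading the listing, it helps to know how the writer is driven: CarbonData's data-loading framework calls the lifecycle methods below in a fixed order while one shard of a segment is written. The sketch that follows is illustrative only and is not part of CarbonData; blockId, pageSize and pages are placeholders, and in the real flow the writer is obtained from the Lucene DataMapFactory rather than being called by hand.

import java.io.IOException;

import org.apache.carbondata.core.datastore.page.ColumnPage;
import org.apache.carbondata.datamap.lucene.LuceneDataMapWriter;

// Hypothetical driver sketch showing the expected callback order for one shard.
class LuceneWriterCallbackOrder {
    static void writeOneShard(LuceneDataMapWriter writer, String blockId, int pageSize,
            ColumnPage[] pages) throws IOException {
        writer.onBlockStart(blockId);              // one carbondata file (block) begins
        writer.onBlockletStart(0);                 // first blocklet: opens the RAM directory and index writer
        writer.onPageAdded(0, 0, pageSize, pages); // one ColumnPage per indexed column, one call per page
        writer.onPageAdded(0, 1, pageSize, pages); // next page of the same blocklet
        writer.onBlockletEnd(0);                   // merges the RAM index into the on-disk index
        writer.onBlockEnd(blockId);                // block finished
        writer.finish();                           // flushes the cache and closes the IndexWriter
    }
}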

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.carbondata.datamap.lucene;

import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.carbondata.common.annotations.InterfaceAudience;
import org.apache.carbondata.common.logging.LogServiceFactory;
import org.apache.carbondata.core.constants.CarbonCommonConstants;
import org.apache.carbondata.core.datamap.Segment;
import org.apache.carbondata.core.datamap.dev.DataMapWriter;
import org.apache.carbondata.core.datastore.impl.FileFactory;
import org.apache.carbondata.core.datastore.page.ColumnPage;
import org.apache.carbondata.core.metadata.datatype.DataType;
import org.apache.carbondata.core.metadata.datatype.DataTypes;
import org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn;
import org.apache.carbondata.core.util.CarbonProperties;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat;
import org.apache.lucene.codecs.lucene62.Lucene62Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.DoublePoint;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FloatPoint;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.document.IntRangeField;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.solr.store.hdfs.HdfsDirectory;
import org.roaringbitmap.IntIterator;
import org.roaringbitmap.RoaringBitmap;

/**
 * Implementation to write a Lucene index while data is being loaded
 */
@InterfaceAudience.Internal
public class LuceneDataMapWriter extends DataMapWriter {
    /**
     * logger
     */
    private static final Logger LOGGER = LogServiceFactory.getLogService(LuceneDataMapWriter.class.getName());

    /**
     * index writer
     */
    private IndexWriter indexWriter = null;

    private Analyzer analyzer = null;

    public static final String PAGEID_NAME = "pageId";

    public static final String ROWID_NAME = "rowId";

    private Codec speedCodec = new Lucene62Codec(Lucene50StoredFieldsFormat.Mode.BEST_SPEED);

    private Codec compressionCodec = new Lucene62Codec(Lucene50StoredFieldsFormat.Mode.BEST_COMPRESSION);

    private Map<LuceneColumnKeys, Map<Integer, RoaringBitmap>> cache = new HashMap<>();

    private int cacheSize;

    private ByteBuffer intBuffer = ByteBuffer.allocate(4);

    private boolean storeBlockletWise;

    LuceneDataMapWriter(String tablePath, String dataMapName, List<CarbonColumn> indexColumns, Segment segment,
            String shardName, int flushSize, boolean storeBlockletWise) {
        super(tablePath, dataMapName, indexColumns, segment, shardName);
        this.cacheSize = flushSize;
        this.storeBlockletWise = storeBlockletWise;
    }

    /**
     * Start of new block notification.
     */
    public void onBlockStart(String blockId) throws IOException {

    }

    /**
     * End of block notification
     */
    public void onBlockEnd(String blockId) throws IOException {

    }

    private RAMDirectory ramDir;
    private IndexWriter ramIndexWriter;

    /**
     * Start of new blocklet notification.
     */
    public void onBlockletStart(int blockletId) throws IOException {
        if (null == analyzer) {
            if (CarbonProperties.getInstance()
                    .getProperty(CarbonCommonConstants.CARBON_LUCENE_INDEX_STOP_WORDS,
                            CarbonCommonConstants.CARBON_LUCENE_INDEX_STOP_WORDS_DEFAULT)
                    .equalsIgnoreCase("true")) {
                analyzer = new StandardAnalyzer(CharArraySet.EMPTY_SET);
            } else {
                analyzer = new StandardAnalyzer();
            }
        }
        // save index data into RAM; it is written to disk when the blocklet finishes
        ramDir = new RAMDirectory();
        ramIndexWriter = new IndexWriter(ramDir, new IndexWriterConfig(analyzer));

        if (indexWriter != null) {
            return;
        }
        // get index path, put index data into segment's path
        String dataMapPath;
        if (storeBlockletWise) {
            dataMapPath = this.dataMapPath + File.separator + blockletId;
        } else {
            dataMapPath = this.dataMapPath;
        }
        Path indexPath = FileFactory.getPath(dataMapPath);
        FileSystem fs = FileFactory.getFileSystem(indexPath);

        // if the index path does not exist, create it
        if (!fs.exists(indexPath)) {
            if (!fs.mkdirs(indexPath)) {
                throw new IOException("Failed to create directory " + dataMapPath);
            }
        }

        // the index writer closes the FileSystem when the writer is closed, so use a new
        // configuration with the FileSystem cache disabled; then only this writer's FileSystem is closed
        Configuration conf = FileFactory.getConfiguration();
        conf.set("fs.hdfs.impl.disable.cache", "true");

        // create an index writer
        Directory indexDir = new HdfsDirectory(indexPath, conf);

        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
        if (CarbonProperties.getInstance()
                .getProperty(CarbonCommonConstants.CARBON_LUCENE_COMPRESSION_MODE,
                        CarbonCommonConstants.CARBON_LUCENE_COMPRESSION_MODE_DEFAULT)
                .equalsIgnoreCase(CarbonCommonConstants.CARBON_LUCENE_COMPRESSION_MODE_DEFAULT)) {
            indexWriterConfig.setCodec(speedCodec);
        } else {
            indexWriterConfig.setCodec(compressionCodec);
        }

        indexWriter = new IndexWriter(indexDir, indexWriterConfig);
    }

    /**
     * End of blocklet notification
     */
    public void onBlockletEnd(int blockletId) throws IOException {
        // close ram writer
        ramIndexWriter.close();

        // merge the in-memory index data into the on-disk index
        indexWriter.addIndexes(ramDir);

        // release the in-memory directory
        ramDir.close();

        if (storeBlockletWise) {
            flushCache(cache, getIndexColumns(), indexWriter, storeBlockletWise);
            indexWriter.close();
            indexWriter = null;
        }
    }

    /**
     * Adds a row of the column pages to the datamap. The order of pages is the same as
     * `indexColumns` in the DataMapMeta returned by the DataMapFactory.
     * Implementations should copy the content of `pages` as needed, because the `pages` memory
     * may be freed after this method returns when unsafe column pages are used.
     */
    public void onPageAdded(int blockletId, int pageId, int pageSize, ColumnPage[] pages) throws IOException {
        // buffer the index data in memory; it is written to disk when the blocklet ends or the cache is flushed
        int columnsCount = pages.length;
        if (columnsCount <= 0) {
            LOGGER.warn(
                    "No data in the page " + pageId + "with blockletid " + blockletId + " to write lucene datamap");
            return;
        }
        for (int rowId = 0; rowId < pageSize; rowId++) {
            // add indexed columns value into the document
            LuceneColumnKeys columns = new LuceneColumnKeys(getIndexColumns().size());
            int i = 0;
            for (ColumnPage page : pages) {
                if (!page.getNullBits().get(rowId)) {
                    columns.colValues[i++] = getValue(page, rowId);
                }
            }
            if (cacheSize > 0) {
                addToCache(columns, rowId, pageId, blockletId, cache, intBuffer, storeBlockletWise);
            } else {
                addData(columns, rowId, pageId, blockletId, intBuffer, ramIndexWriter, getIndexColumns(),
                        storeBlockletWise);
            }
        }
        if (cacheSize > 0) {
            flushCacheIfPossible();
        }
    }

    private static void addField(Document doc, Object key, String fieldName, Field.Store store) {
        // add the value to the document, choosing a Lucene field type that matches the key's Java type
        if (key instanceof Byte) {
            // byte type: use an int range field, since Lucene has no byte type
            byte value = (Byte) key;
            IntRangeField field = new IntRangeField(fieldName, new int[] { Byte.MIN_VALUE },
                    new int[] { Byte.MAX_VALUE });
            field.setIntValue(value);
            doc.add(field);

            // if the value needs to be stored, also add a StoredField
            if (store == Field.Store.YES) {
                doc.add(new StoredField(fieldName, (int) value));
            }
        } else if (key instanceof Short) {
            // short type: use an int range field, since Lucene has no short type
            short value = (Short) key;
            IntRangeField field = new IntRangeField(fieldName, new int[] { Short.MIN_VALUE },
                    new int[] { Short.MAX_VALUE });
            field.setShortValue(value);
            doc.add(field);

            // if the value needs to be stored, also add a StoredField
            if (store == Field.Store.YES) {
                doc.add(new StoredField(fieldName, (int) value));
            }
        } else if (key instanceof Integer) {
            // int type: use an int point field
            int value = (Integer) key;
            doc.add(new IntPoint(fieldName, new int[] { value }));

            // if the value needs to be stored, also add a StoredField
            if (store == Field.Store.YES) {
                doc.add(new StoredField(fieldName, value));
            }
        } else if (key instanceof Long) {
            // long type: use a long point field
            long value = (Long) key;
            doc.add(new LongPoint(fieldName, new long[] { value }));

            // if the value needs to be stored, also add a StoredField
            if (store == Field.Store.YES) {
                doc.add(new StoredField(fieldName, value));
            }
        } else if (key instanceof Float) {
            // float type: use a float point field
            float value = (Float) key;
            doc.add(new FloatPoint(fieldName, new float[] { value }));

            // if the value needs to be stored, also add a StoredField
            if (store == Field.Store.YES) {
                doc.add(new StoredField(fieldName, value));
            }
        } else if (key instanceof Double) {
            // double type: use a double point field
            double value = (Double) key;
            doc.add(new DoublePoint(fieldName, new double[] { value }));

            // if the value needs to be stored, also add a StoredField
            if (store == Field.Store.YES) {
                doc.add(new StoredField(fieldName, value));
            }
        } else if (key instanceof String) {
            // string type: index (and optionally store) the value as a tokenized text field
            String strValue = (String) key;
            doc.add(new TextField(fieldName, strValue, store));
        } else if (key instanceof Boolean) {
            // boolean type: index as 0/1 in an int range field, since Lucene has no boolean type
            boolean value = (Boolean) key;
            IntRangeField field = new IntRangeField(fieldName, new int[] { 0 }, new int[] { 1 });
            field.setIntValue(value ? 1 : 0);
            doc.add(field);
            if (store == Field.Store.YES) {
                doc.add(new StoredField(fieldName, value ? 1 : 0));
            }
        }
    }

    private Object getValue(ColumnPage page, int rowId) {

        //get field type
        DataType type = page.getColumnSpec().getSchemaDataType();
        Object value = null;
        if (type == DataTypes.BYTE) {
            // byte type: indexed later as an int range field, since Lucene has no byte type
            value = page.getByte(rowId);
        } else if (type == DataTypes.SHORT) {
            // short type: indexed later as an int range field, since Lucene has no short type
            value = page.getShort(rowId);
        } else if (type == DataTypes.INT) {
            // int type: indexed later as an int point field
            value = page.getInt(rowId);
        } else if (type == DataTypes.LONG) {
            // long type: indexed later as a long point field
            value = page.getLong(rowId);
        } else if (type == DataTypes.FLOAT) {
            value = page.getFloat(rowId);
        } else if (type == DataTypes.DOUBLE) {
            value = page.getDouble(rowId);
        } else if (type == DataTypes.STRING) {
            byte[] bytes = page.getBytes(rowId);
            try {
                // skip the 2-byte length prefix stored ahead of the actual string bytes
                value = new String(bytes, 2, bytes.length - 2, "UTF-8");
            } catch (UnsupportedEncodingException e) {
                throw new RuntimeException(e);
            }
        } else if (type == DataTypes.DATE) {
            throw new RuntimeException("unsupported data type " + type);
        } else if (type == DataTypes.TIMESTAMP) {
            throw new RuntimeException("unsupported data type " + type);
        } else if (type == DataTypes.BOOLEAN) {
            value = page.getBoolean(rowId);
        } else {
            LOGGER.error("unsupported data type " + type);
            throw new RuntimeException("unsupported data type " + type);
        }
        return value;
    }

    public static void addToCache(LuceneColumnKeys key, int rowId, int pageId, int blockletId,
            Map<LuceneColumnKeys, Map<Integer, RoaringBitmap>> cache, ByteBuffer intBuffer,
            boolean storeBlockletWise) {
        Map<Integer, RoaringBitmap> setMap = cache.get(key);
        if (setMap == null) {
            setMap = new HashMap<>();
            cache.put(key, setMap);
        }
        int combinKey;
        if (!storeBlockletWise) {
            intBuffer.clear();
            intBuffer.putShort((short) blockletId);
            intBuffer.putShort((short) pageId);
            intBuffer.rewind();
            combinKey = intBuffer.getInt();
        } else {
            combinKey = pageId;
        }
        RoaringBitmap bitSet = setMap.get(combinKey);
        if (bitSet == null) {
            bitSet = new RoaringBitmap();
            setMap.put(combinKey, bitSet);
        }
        bitSet.add(rowId);
    }

    public static void addData(LuceneColumnKeys key, int rowId, int pageId, int blockletId, ByteBuffer intBuffer,
            IndexWriter indexWriter, List<CarbonColumn> indexCols, boolean storeBlockletWise) throws IOException {

        Document document = new Document();
        for (int i = 0; i < key.getColValues().length; i++) {
            addField(document, key.getColValues()[i], indexCols.get(i).getColName(), Field.Store.NO);
        }
        intBuffer.clear();
        if (storeBlockletWise) {
            // no need to store the blocklet id, since each blocklet has its own index
            intBuffer.putShort((short) pageId);
            intBuffer.putShort((short) rowId);
            intBuffer.rewind();
            document.add(new StoredField(ROWID_NAME, intBuffer.getInt()));
        } else {
            intBuffer.putShort((short) blockletId);
            intBuffer.putShort((short) pageId);
            intBuffer.rewind();
            document.add(new StoredField(PAGEID_NAME, intBuffer.getInt()));
            document.add(new StoredField(ROWID_NAME, (short) rowId));
        }
        indexWriter.addDocument(document);
    }

    private void flushCacheIfPossible() throws IOException {
        if (cache.size() > cacheSize) {
            flushCache(cache, getIndexColumns(), indexWriter, storeBlockletWise);
        }
    }

    public static void flushCache(Map<LuceneColumnKeys, Map<Integer, RoaringBitmap>> cache,
            List<CarbonColumn> indexCols, IndexWriter indexWriter, boolean storeBlockletWise) throws IOException {
        for (Map.Entry<LuceneColumnKeys, Map<Integer, RoaringBitmap>> entry : cache.entrySet()) {
            Document document = new Document();
            LuceneColumnKeys key = entry.getKey();
            for (int i = 0; i < key.getColValues().length; i++) {
                addField(document, key.getColValues()[i], indexCols.get(i).getColName(), Field.Store.NO);
            }
            Map<Integer, RoaringBitmap> value = entry.getValue();
            int count = 0;
            for (Map.Entry<Integer, RoaringBitmap> pageData : value.entrySet()) {
                RoaringBitmap bitMap = pageData.getValue();
                int cardinality = bitMap.getCardinality();
                // allocate 2 bytes per row id (short) plus 4 bytes for the page key
                ByteBuffer byteBuffer = ByteBuffer.allocate(cardinality * 2 + 4);
                if (!storeBlockletWise) {
                    byteBuffer.putInt(pageData.getKey());
                } else {
                    byteBuffer.putShort(pageData.getKey().shortValue());
                }
                IntIterator intIterator = bitMap.getIntIterator();
                while (intIterator.hasNext()) {
                    byteBuffer.putShort((short) intIterator.next());
                }
                document.add(new StoredField(PAGEID_NAME + count, byteBuffer.array()));
                count++;
            }
            indexWriter.addDocument(document);
        }
        cache.clear();
    }

    /**
     * This is called while the writer is being closed, so no more data will be sent to this
     * class after this call.
     */
    public void finish() throws IOException {
        if (!isWritingFinished()) {
            flushCache(cache, getIndexColumns(), indexWriter, storeBlockletWise);
            // finished a file , close this index writer
            if (indexWriter != null) {
                indexWriter.close();
                indexWriter = null;
            }
            setWritingFinished(true);
        }
    }

    /**
     * Keeps column values of a single row.
     */
    public static class LuceneColumnKeys {

        private Object[] colValues;

        public LuceneColumnKeys(int size) {
            colValues = new Object[size];
        }

        public Object[] getColValues() {
            return colValues;
        }

        @Override
        public boolean equals(Object o) {
            if (this == o)
                return true;
            if (o == null || getClass() != o.getClass())
                return false;
            LuceneColumnKeys that = (LuceneColumnKeys) o;
            return Arrays.equals(colValues, that.colValues);
        }

        @Override
        public int hashCode() {
            return Arrays.hashCode(colValues);
        }
    }
}
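
A note on the stored id fields above: addData() and addToCache() pack a blocklet id and a page id (two 16-bit values) into a single int by writing two shorts into a ByteBuffer and reading the result back as an int. The minimal, standalone sketch below (plain Java, not CarbonData code) shows the packing and how a reader can unpack the two ids again.

import java.nio.ByteBuffer;

// Standalone illustration of the short/short-to-int packing used by addData() and addToCache().
public class IdPackingSketch {
    public static void main(String[] args) {
        short blockletId = 3;
        short pageId = 17;

        // pack: with the default big-endian order, the blocklet id lands in the high 16 bits,
        // the page id in the low 16 bits
        ByteBuffer buffer = ByteBuffer.allocate(4);
        buffer.putShort(blockletId);
        buffer.putShort(pageId);
        buffer.rewind();
        int combinedKey = buffer.getInt();

        // unpack: read the two shorts back in the same order
        ByteBuffer readBuffer = ByteBuffer.allocate(4);
        readBuffer.putInt(combinedKey);
        readBuffer.rewind();
        short unpackedBlockletId = readBuffer.getShort();
        short unpackedPageId = readBuffer.getShort();

        // prints "blocklet 3, page 17"
        System.out.println("blocklet " + unpackedBlockletId + ", page " + unpackedPageId);
    }
}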