com.facebook.presto.tpch.GeneratingTpchDataFileLoader.java Source code

Java tutorial

Introduction

Here is the source code for com.facebook.presto.tpch.GeneratingTpchDataFileLoader.java.

Source

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.facebook.presto.tpch;

import com.facebook.presto.execution.TaskId;
import com.facebook.presto.metadata.ColumnFileHandle;
import com.facebook.presto.operator.OperatorContext;
import com.facebook.presto.operator.Page;
import com.facebook.presto.operator.RecordProjectOperator;
import com.facebook.presto.operator.TaskContext;
import com.facebook.presto.serde.BlocksFileEncoding;
import com.facebook.presto.spi.ColumnMetadata;
import com.facebook.presto.sql.analyzer.Session;
import com.facebook.presto.util.DelimitedRecordSet;
import com.facebook.presto.util.Threads;
import com.google.common.base.Splitter;
import com.google.common.base.Throwables;
import com.google.common.hash.Hashing;
import com.google.common.io.ByteStreams;
import com.google.common.io.Files;
import com.google.common.io.InputSupplier;
import com.google.common.io.Resources;

import java.io.File;
import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.jar.JarEntry;
import java.util.jar.JarFile;

import static com.google.common.base.Charsets.UTF_8;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.io.CharStreams.newReaderSupplier;

/**
 * Extracts TPCH data into serialized column file formats.
 * It will also cache the extracted columns in the local file system to help mitigate the cost of the operation.
 */
public class GeneratingTpchDataFileLoader implements TpchDataFileLoader {
    private final TableInputSupplierFactory tableInputSupplierFactory;
    private final File cacheDirectory;

    /**
     * @param tableInputSupplierFactory source of raw '|'-delimited table data, keyed by table name
     * @param cacheDirectory directory where generated column files are cached (created lazily on first use)
     * @throws IllegalArgumentException if {@code cacheDirectory} exists but is not a directory
     */
    public GeneratingTpchDataFileLoader(TableInputSupplierFactory tableInputSupplierFactory, File cacheDirectory) {
        // Message fixed to match the actual parameter name (was "tableInputStreamProvider is null")
        checkNotNull(tableInputSupplierFactory, "tableInputSupplierFactory is null");
        checkNotNull(cacheDirectory, "cacheDirectory is null");
        checkArgument(!cacheDirectory.exists() || cacheDirectory.isDirectory(),
                "cacheDirectory must be a directory");
        this.tableInputSupplierFactory = tableInputSupplierFactory;
        this.cacheDirectory = cacheDirectory;
    }

    public GeneratingTpchDataFileLoader(TableInputSupplierFactory tableInputSupplierFactory,
            String cacheDirectoryName) {
        this(tableInputSupplierFactory, new File(checkNotNull(cacheDirectoryName, "cacheDirectoryName is null")));
    }

    public GeneratingTpchDataFileLoader(String cacheDirectoryName) {
        this(autoSelectTableInputStreamProvider(), cacheDirectoryName);
    }

    /** Uses the {@code tpchCacheDir} system property for the cache location, defaulting to /tmp/tpchdatacache. */
    public GeneratingTpchDataFileLoader() {
        this(System.getProperty("tpchCacheDir", "/tmp/tpchdatacache"));
    }

    /** Strategy for locating the raw .tbl data for a TPCH table. */
    private interface TableInputSupplierFactory {
        InputSupplier<InputStream> getInputSupplier(String tableName);
    }

    /** Reads raw table data out of an explicitly specified jar file on the local file system. */
    private static class JarTableInputSupplierFactory implements TableInputSupplierFactory {
        private final String jarFileName;

        private JarTableInputSupplierFactory(String jarFileName) {
            this.jarFileName = checkNotNull(jarFileName, "jarFileName is null");
        }

        @Override
        public InputSupplier<InputStream> getInputSupplier(final String tableName) {
            checkNotNull(tableName, "tableName is null");
            return new InputSupplier<InputStream>() {
                @Override
                public InputStream getInput() throws IOException {
                    // getInput() already declares IOException, so let it propagate as a checked
                    // exception instead of rewrapping via Throwables.propagate (the original
                    // catch block converted it to unchecked for no benefit).
                    final JarFile jarFile = new JarFile(jarFileName);
                    try {
                        JarEntry entry = jarFile.getJarEntry(createTableFileName(tableName));
                        if (entry == null) {
                            // getJarEntry returns null for a missing entry; fail with a clear
                            // message instead of an NPE from getInputStream(null)
                            throw new IOException(String.format("Table file %s not found in jar %s",
                                    createTableFileName(tableName), jarFileName));
                        }
                        // Closing the JarFile would close the entry stream too, so tie the
                        // JarFile's lifetime to the returned stream (the original leaked it)
                        return new FilterInputStream(jarFile.getInputStream(entry)) {
                            @Override
                            public void close() throws IOException {
                                try {
                                    super.close();
                                } finally {
                                    jarFile.close();
                                }
                            }
                        };
                    } catch (IOException e) {
                        jarFile.close(); // don't leak the jar handle when no stream is handed out
                        throw e;
                    }
                }
            };
        }
    }

    /** Reads raw table data from a classpath resource (the default when no jar override is set). */
    private static class ResourcesTableInputSupplierFactory implements TableInputSupplierFactory {
        @Override
        public InputSupplier<InputStream> getInputSupplier(String tableName) {
            checkNotNull(tableName, "tableName is null");
            return Resources.newInputStreamSupplier(Resources.getResource(createTableFileName(tableName)));
        }
    }

    /**
     * Selects the data source: a jar named by the {@code tpchDataJar} system property if set,
     * otherwise classpath resources.
     */
    private static TableInputSupplierFactory autoSelectTableInputStreamProvider() {
        String tpchDataJarFileOverride = System.getProperty("tpchDataJar");
        if (tpchDataJarFileOverride != null) {
            return new JarTableInputSupplierFactory(tpchDataJarFileOverride);
        }
        return new ResourcesTableInputSupplierFactory();
    }

    /**
     * Returns the serialized column file for the given table column, generating and caching it
     * on first request. The cache key includes a hash of the first 1MB of the raw table data,
     * so a changed data source invalidates previously generated files.
     *
     * @throws RuntimeException wrapping any IOException raised while reading or writing data
     */
    @Override
    public File getDataFile(TpchTableHandle tableHandle, TpchColumnHandle columnHandle,
            BlocksFileEncoding encoding) {
        checkNotNull(tableHandle, "tableHandle is null");
        checkNotNull(columnHandle, "columnHandle is null");
        checkNotNull(encoding, "encoding is null");

        String tableName = tableHandle.getTableName();
        ExecutorService executor = Executors.newCachedThreadPool(Threads.daemonThreadsNamed("tpch-generate-%s"));
        try {
            // Hash only a 1MB prefix: enough to detect a changed data source without
            // paying to stream the whole (potentially large) table
            String hash = ByteStreams
                    .hash(ByteStreams.slice(tableInputSupplierFactory.getInputSupplier(tableName), 0, 1024 * 1024),
                            Hashing.murmur3_32())
                    .toString();

            File cachedFile = new File(new File(cacheDirectory, tableName + "-" + hash),
                    "new-" + createFileName(columnHandle, encoding));
            if (cachedFile.exists()) {
                return cachedFile;
            }

            Files.createParentDirs(cachedFile);

            InputSupplier<InputStream> inputSupplier = tableInputSupplierFactory.getInputSupplier(tableName);

            ColumnMetadata columnMetadata = new TpchMetadata().getColumnMetadata(tableHandle, columnHandle);

            // TPCH .tbl files are '|'-delimited; project out just the requested column
            DelimitedRecordSet records = new DelimitedRecordSet(newReaderSupplier(inputSupplier, UTF_8),
                    Splitter.on("|"), columnMetadata);

            // Minimal synthetic session/task context so the record projection operator can run
            Session session = new Session("user", "source", "catalog", "schema", "address", "agent");
            OperatorContext operatorContext = new TaskContext(new TaskId("query", "stage", "task"), executor,
                    session).addPipelineContext(true, true).addDriverContext().addOperatorContext(0,
                            "tpch-generate");

            RecordProjectOperator source = new RecordProjectOperator(operatorContext, records);

            ColumnFileHandle columnFileHandle = ColumnFileHandle.builder(0)
                    .addColumn(columnHandle, cachedFile, encoding).build();

            // Drain the operator, appending every produced page to the column file
            while (!source.isFinished()) {
                Page page = source.getOutput();
                if (page != null) {
                    columnFileHandle.append(page);
                }
            }
            columnFileHandle.commit();

            return cachedFile;
        } catch (IOException e) {
            throw Throwables.propagate(e);
        } finally {
            executor.shutdownNow();
        }
    }

    /** Maps a TPCH table name to its raw data file name, e.g. {@code orders -> orders.tbl}. */
    private static String createTableFileName(String tableName) {
        return tableName + ".tbl";
    }

    /** Builds the cached column file name from field index, column type, and encoding. */
    private static String createFileName(TpchColumnHandle columnHandle, BlocksFileEncoding encoding) {
        return String.format("column%d.%s_%s.data", columnHandle.getFieldIndex(), columnHandle.getType(),
                encoding.getName());
    }
}