bulkload.UploadTuples.java Source code

Java tutorial

Introduction

Here is the source code for bulkload.UploadTuples.java

Source

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package bulkload;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;

import org.apache.cassandra.config.Config;
import org.apache.cassandra.dht.Murmur3Partitioner;
import org.apache.cassandra.exceptions.InvalidRequestException;
import org.apache.cassandra.io.sstable.CQLSSTableWriter;

/**
 * Usage: java bulkload.BulkLoad1
 */
public class UploadTuples {

    //////////////////////////////////////////////////////////////////////////////////////////////////////////////

    private static final String SCHEMA_1 = String.format("CREATE TABLE %s.%s (" + "id int, " + "lhs list<int>, "
            + "predicate int, " + "rhs list<int>, " + "PRIMARY KEY (id) " + ") ", Constants.KEYSPACE,
            Constants.CF_TUPLE);
    private static final String INSERT_STMT_1 = String.format(
            "INSERT INTO %s.%s (id, lhs, predicate, rhs) VALUES (?, ?, ?, ?)", Constants.KEYSPACE,
            Constants.CF_TUPLE);

    //////////////////////////////////////////////////////////////////////////////////////////////////////////////

    private static List<Integer> toIntList(String line) {
        List<Integer> list = new ArrayList<>();
        String[] parts = line.split(",");
        for (String part : parts) {
            int id = Integer.parseInt(part);
            list.add(id);
        }
        return list;
    }

    //////////////////////////////////////////////////////////////////////////////////////////////////////////////

    public static void main(String[] args) {
        if (args.length != 1) {
            System.out.println("usage: java bulkload.UploadTuples /path/to/free_base/data/");
            return;
        }

        String Freebase_base = args[0];

        // magic!
        Config.setClientMode(true);

        // create main table
        {
            // Create output directory that has keyspace and table name in the path
            File outputDir1 = new File(Constants.KEYSPACE + File.separator + Constants.CF_TUPLE);
            if (!outputDir1.exists() && !outputDir1.mkdirs()) {
                throw new RuntimeException("Cannot create output directory: " + outputDir1);
            }

            // Prepare SSTable writer
            CQLSSTableWriter.Builder builder = CQLSSTableWriter.builder();
            // set output directory
            builder.inDirectory(outputDir1)
                    // set target schema
                    .forTable(SCHEMA_1)
                    // set CQL statement to put data
                    .using(INSERT_STMT_1)
                    // set partitioner if needed
                    // default is Murmur3Partitioner so set if you use different one.
                    .withPartitioner(new Murmur3Partitioner());

            CQLSSTableWriter writer = builder.build();

            try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                    new FileInputStream(Freebase_base + File.separator + "freebase_data.txt")))) {

                // Write to SSTable while reading data
                int id_counter = 1;
                int hm_counter = 1;
                String line;
                while ((line = reader.readLine()) != null) {
                    String[] parts = line.split("\\|");
                    if (parts.length == 3) {
                        List<Integer> lhs = toIntList(parts[0]);
                        List<Integer> rhs = toIntList(parts[2]);
                        int predicate = Integer.parseInt(parts[1]);

                        writer.addRow(id_counter, lhs, predicate, rhs);

                        id_counter += 1;

                        if ((id_counter % 10_000_000) == 0) {
                            System.out.println(id_counter);
                        }

                    }

                } // while loop

            } catch (InvalidRequestException | IOException e) {
                e.printStackTrace();
            }

            try {
                writer.close();
            } catch (IOException ignore) {
            }

        }

        System.out.println("done");
        String path = Constants.KEYSPACE + File.separator + Constants.CF_TUPLE;
        System.out.println("you can upload these files to Cassandra: sstableloader -d host " + path);
        System.exit(0);

    } // main

}