List of usage examples for org.apache.cassandra.hadoop ConfigHelper setInputSplitSize
public static void setInputSplitSize(Configuration conf, int splitsize)
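Before the examples pulled from real projects below, a minimal self-contained sketch of the call in context may help. setInputSplitSize controls roughly how many rows each input split covers, and therefore how many map tasks the job runs. The host, port, partitioner, keyspace, and table here are placeholders, and the 64 * 1024 value mirrors what the library is believed to default to when the key is unset.

import org.apache.cassandra.hadoop.ColumnFamilyInputFormat;
import org.apache.cassandra.hadoop.ConfigHelper;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class SplitSizeExample
{
    public static void main(String[] args) throws Exception
    {
        Job job = new Job(new Configuration(), "split-size-example");
        Configuration conf = job.getConfiguration();

        // Placeholder cluster settings; adjust to your environment.
        ConfigHelper.setInputInitialAddress(conf, "127.0.0.1");
        ConfigHelper.setInputRpcPort(conf, "9160");
        ConfigHelper.setInputPartitioner(conf, "org.apache.cassandra.dht.Murmur3Partitioner");
        ConfigHelper.setInputColumnFamily(conf, "my_keyspace", "my_table");

        // Ask for ~64k rows per split (believed to be the library default when unset).
        ConfigHelper.setInputSplitSize(conf, 64 * 1024);

        job.setInputFormatClass(ColumnFamilyInputFormat.class);
        // ... set mapper/reducer and submit as usual.
    }
}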
From source file: com.dse.pig.udfs.CqlStorage.java
License: Apache License
/** set read configuration settings */
public void setLocation(String location, Job job) throws IOException
{
    conf = job.getConfiguration();
    setLocationFromUri(location);

    if (username != null && password != null)
        ConfigHelper.setInputKeyspaceUserNameAndPassword(conf, username, password);
    if (splitSize > 0)
        ConfigHelper.setInputSplitSize(conf, splitSize);
    if (partitionerClass != null)
        ConfigHelper.setInputPartitioner(conf, partitionerClass);
    if (rpcPort != null)
        ConfigHelper.setInputRpcPort(conf, rpcPort);
    if (initHostAddress != null)
        ConfigHelper.setInputInitialAddress(conf, initHostAddress);

    ConfigHelper.setInputColumnFamily(conf, keyspace, column_family);
    setConnectionInformation();

    CqlConfigHelper.setInputCQLPageRowSize(conf, String.valueOf(pageSize));
    if (columns != null && !columns.trim().isEmpty())
        CqlConfigHelper.setInputColumns(conf, columns);

    String whereClauseForPartitionFilter = getWhereClauseForPartitionFilter();
    String wc = whereClause != null && !whereClause.trim().isEmpty()
            ? whereClauseForPartitionFilter == null
                    ? whereClause
                    : String.format("%s AND %s", whereClause.trim(), whereClauseForPartitionFilter)
            : whereClauseForPartitionFilter;

    if (wc != null)
    {
        logger.debug("where clause: {}", wc);
        CqlConfigHelper.setInputWhereClauses(conf, wc);
    }

    if (System.getenv(PIG_INPUT_SPLIT_SIZE) != null)
    {
        try
        {
            ConfigHelper.setInputSplitSize(conf, Integer.valueOf(System.getenv(PIG_INPUT_SPLIT_SIZE)));
        }
        catch (NumberFormatException e)
        {
            throw new IOException("PIG_INPUT_SPLIT_SIZE is not a number", e);
        }
    }

    if (ConfigHelper.getInputRpcPort(conf) == 0)
        throw new IOException("PIG_INPUT_RPC_PORT or PIG_RPC_PORT environment variable not set");
    if (ConfigHelper.getInputInitialAddress(conf) == null)
        throw new IOException("PIG_INPUT_INITIAL_ADDRESS or PIG_INITIAL_ADDRESS environment variable not set");
    if (ConfigHelper.getInputPartitioner(conf) == null)
        throw new IOException("PIG_INPUT_PARTITIONER or PIG_PARTITIONER environment variable not set");
    if (loadSignature == null)
        loadSignature = location;

    initSchema(loadSignature);
}
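Note that setLocation() applies the split size twice: first from the value parsed out of the location URI, then again from the PIG_INPUT_SPLIT_SIZE environment variable, which therefore wins when both are present. A minimal sketch of just that precedence, lifted from the method above (assuming the PIG_INPUT_SPLIT_SIZE constant holds the literal variable name):

// Sketch of the precedence implemented in setLocation(): the URI value is
// applied first, then an environment override replaces it if set.
static void applySplitSize(Configuration conf, int splitSizeFromUri) throws IOException
{
    if (splitSizeFromUri > 0)
        ConfigHelper.setInputSplitSize(conf, splitSizeFromUri);  // from the location URI

    String env = System.getenv("PIG_INPUT_SPLIT_SIZE");          // env var wins if present
    if (env != null)
    {
        try
        {
            ConfigHelper.setInputSplitSize(conf, Integer.parseInt(env));
        }
        catch (NumberFormatException e)
        {
            throw new IOException("PIG_INPUT_SPLIT_SIZE is not a number", e);
        }
    }
}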
From source file: com.dse.pig.udfs.CqlStorage.java
License: Apache License
/** set store configuration settings */
public void setStoreLocation(String location, Job job) throws IOException
{
    conf = job.getConfiguration();
    setLocationFromUri(location);

    if (username != null && password != null)
        ConfigHelper.setOutputKeyspaceUserNameAndPassword(conf, username, password);
    if (splitSize > 0)
        ConfigHelper.setInputSplitSize(conf, splitSize);
    if (partitionerClass != null)
        ConfigHelper.setOutputPartitioner(conf, partitionerClass);
    if (rpcPort != null)
    {
        ConfigHelper.setOutputRpcPort(conf, rpcPort);
        ConfigHelper.setInputRpcPort(conf, rpcPort);
    }
    if (initHostAddress != null)
    {
        ConfigHelper.setOutputInitialAddress(conf, initHostAddress);
        ConfigHelper.setInputInitialAddress(conf, initHostAddress);
    }

    ConfigHelper.setOutputColumnFamily(conf, keyspace, column_family);
    CqlConfigHelper.setOutputCql(conf, outputQuery);

    setConnectionInformation();

    if (ConfigHelper.getOutputRpcPort(conf) == 0)
        throw new IOException("PIG_OUTPUT_RPC_PORT or PIG_RPC_PORT environment variable not set");
    if (ConfigHelper.getOutputInitialAddress(conf) == null)
        throw new IOException("PIG_OUTPUT_INITIAL_ADDRESS or PIG_INITIAL_ADDRESS environment variable not set");
    if (ConfigHelper.getOutputPartitioner(conf) == null)
        throw new IOException("PIG_OUTPUT_PARTITIONER or PIG_PARTITIONER environment variable not set");

    initSchema(storeSignature);
}
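Two details worth noting here: the shared splitSize field is applied via setInputSplitSize even on the store path, and most output settings have input-side twins so the same storage instance can read and write within one job. For reference, a minimal output-only configuration in the same spirit, with placeholder addresses, keyspace, table, and CQL (the CqlConfigHelper import path is assumed to match Apache Cassandra's org.apache.cassandra.hadoop.cql3):

import org.apache.cassandra.hadoop.ConfigHelper;
import org.apache.cassandra.hadoop.cql3.CqlConfigHelper;
import org.apache.hadoop.conf.Configuration;

public class OutputConfigSketch
{
    // Mirrors the essential calls of setStoreLocation() above.
    static void configureOutput(Configuration conf)
    {
        ConfigHelper.setOutputInitialAddress(conf, "127.0.0.1");  // placeholder host
        ConfigHelper.setOutputRpcPort(conf, "9160");              // placeholder port
        ConfigHelper.setOutputPartitioner(conf, "org.apache.cassandra.dht.Murmur3Partitioner");
        ConfigHelper.setOutputColumnFamily(conf, "my_keyspace", "my_table");
        CqlConfigHelper.setOutputCql(conf, "UPDATE my_keyspace.my_table SET value = ?");
    }
}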
From source file: org.apache.hadoop.hive.cassandra.input.cql.HiveCqlInputFormat.java
License: Apache License
@Override
public RecordReader<MapWritableComparable, MapWritable> getRecordReader(InputSplit split, JobConf jobConf,
        final Reporter reporter) throws IOException
{
    HiveCassandraStandardSplit cassandraSplit = (HiveCassandraStandardSplit) split;

    List<String> columns = CqlSerDe.parseColumnMapping(cassandraSplit.getColumnMapping());
    List<Integer> readColIDs = ColumnProjectionUtils.getReadColumnIDs(jobConf);

    if (columns.size() < readColIDs.size())
    {
        throw new IOException("Cannot read more columns than the given table contains.");
    }

    ColumnFamilySplit cfSplit = cassandraSplit.getSplit();
    Job job = new Job(jobConf);

    TaskAttemptContext tac = new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID())
    {
        @Override
        public void progress()
        {
            reporter.progress();
        }
    };

    SlicePredicate predicate = new SlicePredicate();
    predicate.setColumn_names(getColumnNames(columns, readColIDs));

    try
    {
        boolean wideRows = true;
        ConfigHelper.setInputColumnFamily(tac.getConfiguration(), cassandraSplit.getKeyspace(),
                cassandraSplit.getColumnFamily(), wideRows);
        ConfigHelper.setInputSlicePredicate(tac.getConfiguration(), predicate);
        ConfigHelper.setRangeBatchSize(tac.getConfiguration(), cassandraSplit.getRangeBatchSize());
        ConfigHelper.setInputRpcPort(tac.getConfiguration(), cassandraSplit.getPort() + "");
        ConfigHelper.setInputInitialAddress(tac.getConfiguration(), cassandraSplit.getHost());
        ConfigHelper.setInputPartitioner(tac.getConfiguration(), cassandraSplit.getPartitioner());
        // Set Split Size
        ConfigHelper.setInputSplitSize(tac.getConfiguration(), cassandraSplit.getSplitSize());

        LOG.info("Validators : " + tac.getConfiguration().get(CassandraColumnSerDe.CASSANDRA_VALIDATOR_TYPE));

        List<IndexExpression> indexExpr = parseFilterPredicate(jobConf);
        if (indexExpr != null)
        {
            // We have pushed down a filter from the Hive query; we can use this against secondary indexes.
            ConfigHelper.setInputRange(tac.getConfiguration(), indexExpr);
        }

        CqlHiveRecordReader rr = new CqlHiveRecordReader(new CqlPagingRecordReader());
        rr.initialize(cfSplit, tac);
        return rr;
    }
    catch (Exception ie)
    {
        throw new IOException(ie);
    }
}
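On the read side, the split size travels inside the HiveCassandraStandardSplit and is copied back into the task attempt's configuration before the record reader starts. The setter has a matching getter, so the effective value can be verified; a quick round trip (the ~64k-row fallback when the key is unset is my reading of the library default):

Configuration conf = new Configuration();
ConfigHelper.setInputSplitSize(conf, 128 * 1024);
int effective = ConfigHelper.getInputSplitSize(conf);  // 131072; without the set, the library default (~64k rows)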
From source file: org.apache.hadoop.hive.cassandra.input.cql.HiveCqlInputFormat.java
License: Apache License
@Override
public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException
{
    String ks = jobConf.get(AbstractCassandraSerDe.CASSANDRA_KEYSPACE_NAME);
    String cf = jobConf.get(AbstractCassandraSerDe.CASSANDRA_CF_NAME);
    int slicePredicateSize = jobConf.getInt(AbstractCassandraSerDe.CASSANDRA_SLICE_PREDICATE_SIZE,
            AbstractCassandraSerDe.DEFAULT_SLICE_PREDICATE_SIZE);
    int sliceRangeSize = jobConf.getInt(AbstractCassandraSerDe.CASSANDRA_RANGE_BATCH_SIZE,
            AbstractCassandraSerDe.DEFAULT_RANGE_BATCH_SIZE);
    int splitSize = jobConf.getInt(AbstractCassandraSerDe.CASSANDRA_SPLIT_SIZE,
            AbstractCassandraSerDe.DEFAULT_SPLIT_SIZE);
    String cassandraColumnMapping = jobConf.get(AbstractCassandraSerDe.CASSANDRA_COL_MAPPING);
    int rpcPort = jobConf.getInt(AbstractCassandraSerDe.CASSANDRA_PORT, 9160);
    String host = jobConf.get(AbstractCassandraSerDe.CASSANDRA_HOST);
    String partitioner = jobConf.get(AbstractCassandraSerDe.CASSANDRA_PARTITIONER);

    if (cassandraColumnMapping == null)
    {
        throw new IOException("cassandra.columns.mapping required for Cassandra Table.");
    }

    SliceRange range = new SliceRange();
    range.setStart(new byte[0]);
    range.setFinish(new byte[0]);
    range.setReversed(false);
    range.setCount(slicePredicateSize);
    SlicePredicate predicate = new SlicePredicate();
    predicate.setSlice_range(range);

    ConfigHelper.setInputRpcPort(jobConf, "" + rpcPort);
    ConfigHelper.setInputInitialAddress(jobConf, host);
    ConfigHelper.setInputPartitioner(jobConf, partitioner);
    ConfigHelper.setInputSlicePredicate(jobConf, predicate);
    ConfigHelper.setInputColumnFamily(jobConf, ks, cf);
    ConfigHelper.setRangeBatchSize(jobConf, sliceRangeSize);
    ConfigHelper.setInputSplitSize(jobConf, splitSize);

    Job job = new Job(jobConf);
    JobContext jobContext = new JobContext(job.getConfiguration(), job.getJobID());
    Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);

    List<org.apache.hadoop.mapreduce.InputSplit> splits = getSplits(jobContext);
    InputSplit[] results = new InputSplit[splits.size()];

    for (int i = 0; i < splits.size(); ++i)
    {
        HiveCassandraStandardSplit csplit = new HiveCassandraStandardSplit((ColumnFamilySplit) splits.get(i),
                cassandraColumnMapping, tablePaths[0]);
        csplit.setKeyspace(ks);
        csplit.setColumnFamily(cf);
        csplit.setRangeBatchSize(sliceRangeSize);
        csplit.setSplitSize(splitSize);
        csplit.setHost(host);
        csplit.setPort(rpcPort);
        csplit.setSlicePredicateSize(slicePredicateSize);
        csplit.setPartitioner(partitioner);
        csplit.setColumnMapping(cassandraColumnMapping);
        results[i] = csplit;
    }
    return results;
}
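getSplits() is where a Hive table property becomes a Cassandra split size: the value is read from the JobConf with a default and handed to ConfigHelper before the underlying split computation runs. Condensed to the two lines that matter (the literal key "cassandra.input.split.size" is an assumption about what AbstractCassandraSerDe.CASSANDRA_SPLIT_SIZE resolves to):

// Table property -> JobConf -> ConfigHelper; the wrapped getSplits(jobContext)
// call will then aim for roughly splitSize rows per split.
int splitSize = jobConf.getInt("cassandra.input.split.size", 64 * 1024);  // assumed key and default
ConfigHelper.setInputSplitSize(jobConf, splitSize);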
From source file: org.apache.hadoop.hive.cassandra.input.HiveCassandraStandardColumnInputFormat.java
License: Apache License
@Override
public RecordReader<BytesWritable, MapWritable> getRecordReader(InputSplit split, JobConf jobConf,
        final Reporter reporter) throws IOException
{
    HiveCassandraStandardSplit cassandraSplit = (HiveCassandraStandardSplit) split;

    List<String> columns = CassandraColumnSerDe.parseColumnMapping(cassandraSplit.getColumnMapping());
    isTransposed = CassandraColumnSerDe.isTransposed(columns);

    List<Integer> readColIDs = ColumnProjectionUtils.getReadColumnIDs(jobConf);
    if (columns.size() < readColIDs.size())
    {
        throw new IOException("Cannot read more columns than the given table contains.");
    }

    org.apache.cassandra.hadoop.ColumnFamilySplit cfSplit = cassandraSplit.getSplit();
    Job job = new Job(jobConf);

    TaskAttemptContext tac = new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID())
    {
        @Override
        public void progress()
        {
            reporter.progress();
        }
    };

    SlicePredicate predicate = new SlicePredicate();

    if (isTransposed || readColIDs.size() == columns.size() || readColIDs.size() == 0)
    {
        SliceRange range = new SliceRange();
        AbstractType comparator = BytesType.instance;

        String comparatorType = jobConf.get(AbstractCassandraSerDe.CASSANDRA_SLICE_PREDICATE_RANGE_COMPARATOR);
        if (comparatorType != null && !comparatorType.equals(""))
        {
            try
            {
                comparator = TypeParser.parse(comparatorType);
            }
            catch (ConfigurationException ex)
            {
                throw new IOException("Comparator class not found.");
            }
            catch (SyntaxException e)
            {
                throw new IOException(e);
            }
        }

        String sliceStart = jobConf.get(AbstractCassandraSerDe.CASSANDRA_SLICE_PREDICATE_RANGE_START);
        String sliceEnd = jobConf.get(AbstractCassandraSerDe.CASSANDRA_SLICE_PREDICATE_RANGE_FINISH);
        String reversed = jobConf.get(AbstractCassandraSerDe.CASSANDRA_SLICE_PREDICATE_RANGE_REVERSED);

        range.setStart(comparator.fromString(sliceStart == null ? "" : sliceStart));
        range.setFinish(comparator.fromString(sliceEnd == null ? "" : sliceEnd));
        range.setReversed(reversed == null ? false : reversed.equals("true"));
        range.setCount(cassandraSplit.getSlicePredicateSize());
        predicate.setSlice_range(range);
    }
    else
    {
        int iKey = columns.indexOf(CassandraColumnSerDe.CASSANDRA_KEY_COLUMN);
        predicate.setColumn_names(getColumnNames(iKey, columns, readColIDs));
    }

    try
    {
        boolean wideRows = false;
        if (isTransposed && tac.getConfiguration()
                .getBoolean(CassandraColumnSerDe.CASSANDRA_ENABLE_WIDEROW_ITERATOR, true))
        {
            wideRows = true;
        }

        ConfigHelper.setInputColumnFamily(tac.getConfiguration(), cassandraSplit.getKeyspace(),
                cassandraSplit.getColumnFamily(), wideRows);
        ConfigHelper.setInputSlicePredicate(tac.getConfiguration(), predicate);
        ConfigHelper.setRangeBatchSize(tac.getConfiguration(), cassandraSplit.getRangeBatchSize());
        ConfigHelper.setInputRpcPort(tac.getConfiguration(), cassandraSplit.getPort() + "");
        ConfigHelper.setInputInitialAddress(tac.getConfiguration(), cassandraSplit.getHost());
        ConfigHelper.setInputPartitioner(tac.getConfiguration(), cassandraSplit.getPartitioner());
        // Set Split Size
        ConfigHelper.setInputSplitSize(tac.getConfiguration(), cassandraSplit.getSplitSize());

        LOG.info("Validators : " + tac.getConfiguration().get(CassandraColumnSerDe.CASSANDRA_VALIDATOR_TYPE));

        List<IndexExpression> indexExpr = parseFilterPredicate(jobConf);
        if (indexExpr != null)
        {
            // We have pushed down a filter from the Hive query; we can use this against secondary indexes.
            ConfigHelper.setInputRange(tac.getConfiguration(), indexExpr);
        }

        CassandraHiveRecordReader rr = new CassandraHiveRecordReader(new ColumnFamilyRecordReader(), isTransposed);
        rr.initialize(cfSplit, tac);
        return rr;
    }
    catch (Exception ie)
    {
        throw new IOException(ie);
    }
}
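This variant differs from the CQL input format mainly in how it builds the slice predicate: when all (or no) columns are projected, start/finish strings from the configuration are converted to bytes through a configurable comparator. A trimmed sketch of just that conversion, with placeholder column names and a hard-coded comparator where the code above reads configuration:

// Build a slice over a column range, as in the wide/transposed branch above.
static SlicePredicate buildRange() throws ConfigurationException, SyntaxException
{
    AbstractType comparator = TypeParser.parse("UTF8Type");  // stands in for the configured comparator class
    SliceRange range = new SliceRange();
    range.setStart(comparator.fromString("colA"));           // slice start column (placeholder)
    range.setFinish(comparator.fromString("colZ"));          // slice end column (placeholder)
    range.setReversed(false);
    range.setCount(1000);                                    // max columns returned per row
    SlicePredicate predicate = new SlicePredicate();
    predicate.setSlice_range(range);
    return predicate;
}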