List of usage examples for org.apache.cassandra.hadoop ConfigHelper setInputSplitSize
public static void setInputSplitSize(Configuration conf, int splitsize)
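Before the examples pulled from real projects below, a minimal self-contained sketch of the call in context may help. setInputSplitSize controls roughly how many rows each input split covers, and therefore how many map tasks the job runs. The host, port, partitioner, keyspace, and table here are placeholders, and the 64 * 1024 value mirrors what the library is believed to default to when the key is unset.

import org.apache.cassandra.hadoop.ColumnFamilyInputFormat;
import org.apache.cassandra.hadoop.ConfigHelper;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class SplitSizeExample
{
    public static void main(String[] args) throws Exception
    {
        Job job = new Job(new Configuration(), "split-size-example");
        Configuration conf = job.getConfiguration();

        // Placeholder cluster settings; adjust to your environment.
        ConfigHelper.setInputInitialAddress(conf, "127.0.0.1");
        ConfigHelper.setInputRpcPort(conf, "9160");
        ConfigHelper.setInputPartitioner(conf, "org.apache.cassandra.dht.Murmur3Partitioner");
        ConfigHelper.setInputColumnFamily(conf, "my_keyspace", "my_table");

        // Ask for ~64k rows per split (believed to be the library default when unset).
        ConfigHelper.setInputSplitSize(conf, 64 * 1024);

        job.setInputFormatClass(ColumnFamilyInputFormat.class);
        // ... set mapper/reducer and submit as usual.
    }
}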
From source file: com.dse.pig.udfs.CqlStorage.java
License: Apache License
/** set read configuration settings */
public void setLocation(String location, Job job) throws IOException
{
    conf = job.getConfiguration();
    setLocationFromUri(location);

    if (username != null && password != null)
        ConfigHelper.setInputKeyspaceUserNameAndPassword(conf, username, password);
    if (splitSize > 0)
        ConfigHelper.setInputSplitSize(conf, splitSize);
    if (partitionerClass != null)
        ConfigHelper.setInputPartitioner(conf, partitionerClass);
    if (rpcPort != null)
        ConfigHelper.setInputRpcPort(conf, rpcPort);
    if (initHostAddress != null)
        ConfigHelper.setInputInitialAddress(conf, initHostAddress);

    ConfigHelper.setInputColumnFamily(conf, keyspace, column_family);
    setConnectionInformation();

    CqlConfigHelper.setInputCQLPageRowSize(conf, String.valueOf(pageSize));
    if (columns != null && !columns.trim().isEmpty())
        CqlConfigHelper.setInputColumns(conf, columns);

    String whereClauseForPartitionFilter = getWhereClauseForPartitionFilter();
    String wc = whereClause != null && !whereClause.trim().isEmpty()
            ? whereClauseForPartitionFilter == null
                    ? whereClause
                    : String.format("%s AND %s", whereClause.trim(), whereClauseForPartitionFilter)
            : whereClauseForPartitionFilter;

    if (wc != null)
    {
        logger.debug("where clause: {}", wc);
        CqlConfigHelper.setInputWhereClauses(conf, wc);
    }

    if (System.getenv(PIG_INPUT_SPLIT_SIZE) != null)
    {
        try
        {
            ConfigHelper.setInputSplitSize(conf, Integer.valueOf(System.getenv(PIG_INPUT_SPLIT_SIZE)));
        }
        catch (NumberFormatException e)
        {
            throw new IOException("PIG_INPUT_SPLIT_SIZE is not a number", e);
        }
    }

    if (ConfigHelper.getInputRpcPort(conf) == 0)
        throw new IOException("PIG_INPUT_RPC_PORT or PIG_RPC_PORT environment variable not set");
    if (ConfigHelper.getInputInitialAddress(conf) == null)
        throw new IOException("PIG_INPUT_INITIAL_ADDRESS or PIG_INITIAL_ADDRESS environment variable not set");
    if (ConfigHelper.getInputPartitioner(conf) == null)
        throw new IOException("PIG_INPUT_PARTITIONER or PIG_PARTITIONER environment variable not set");
    if (loadSignature == null)
        loadSignature = location;

    initSchema(loadSignature);
}
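Note that setLocation() applies the split size twice: first from the value parsed out of the location URI, then again from the PIG_INPUT_SPLIT_SIZE environment variable, which therefore wins when both are present. A minimal sketch of just that precedence, lifted from the method above (assuming the PIG_INPUT_SPLIT_SIZE constant holds the literal variable name):

// Sketch of the precedence implemented in setLocation(): the URI value is
// applied first, then an environment override replaces it if set.
static void applySplitSize(Configuration conf, int splitSizeFromUri) throws IOException
{
    if (splitSizeFromUri > 0)
        ConfigHelper.setInputSplitSize(conf, splitSizeFromUri);  // from the location URI

    String env = System.getenv("PIG_INPUT_SPLIT_SIZE");          // env var wins if present
    if (env != null)
    {
        try
        {
            ConfigHelper.setInputSplitSize(conf, Integer.parseInt(env));
        }
        catch (NumberFormatException e)
        {
            throw new IOException("PIG_INPUT_SPLIT_SIZE is not a number", e);
        }
    }
}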
From source file: com.dse.pig.udfs.CqlStorage.java
License: Apache License
/** set store configuration settings */
public void setStoreLocation(String location, Job job) throws IOException
{
    conf = job.getConfiguration();
    setLocationFromUri(location);

    if (username != null && password != null)
        ConfigHelper.setOutputKeyspaceUserNameAndPassword(conf, username, password);
    if (splitSize > 0)
        ConfigHelper.setInputSplitSize(conf, splitSize);
    if (partitionerClass != null)
        ConfigHelper.setOutputPartitioner(conf, partitionerClass);
    if (rpcPort != null)
    {
        ConfigHelper.setOutputRpcPort(conf, rpcPort);
        ConfigHelper.setInputRpcPort(conf, rpcPort);
    }
    if (initHostAddress != null)
    {
        ConfigHelper.setOutputInitialAddress(conf, initHostAddress);
        ConfigHelper.setInputInitialAddress(conf, initHostAddress);
    }

    ConfigHelper.setOutputColumnFamily(conf, keyspace, column_family);
    CqlConfigHelper.setOutputCql(conf, outputQuery);

    setConnectionInformation();

    if (ConfigHelper.getOutputRpcPort(conf) == 0)
        throw new IOException("PIG_OUTPUT_RPC_PORT or PIG_RPC_PORT environment variable not set");
    if (ConfigHelper.getOutputInitialAddress(conf) == null)
        throw new IOException("PIG_OUTPUT_INITIAL_ADDRESS or PIG_INITIAL_ADDRESS environment variable not set");
    if (ConfigHelper.getOutputPartitioner(conf) == null)
        throw new IOException("PIG_OUTPUT_PARTITIONER or PIG_PARTITIONER environment variable not set");

    initSchema(storeSignature);
}
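Two details worth noting here: the shared splitSize field is applied via setInputSplitSize even on the store path, and most output settings have input-side twins so the same storage instance can read and write within one job. For reference, a minimal output-only configuration in the same spirit, with placeholder addresses, keyspace, table, and CQL (the CqlConfigHelper import path is assumed to match Apache Cassandra's org.apache.cassandra.hadoop.cql3):

import org.apache.cassandra.hadoop.ConfigHelper;
import org.apache.cassandra.hadoop.cql3.CqlConfigHelper;
import org.apache.hadoop.conf.Configuration;

public class OutputConfigSketch
{
    // Mirrors the essential calls of setStoreLocation() above.
    static void configureOutput(Configuration conf)
    {
        ConfigHelper.setOutputInitialAddress(conf, "127.0.0.1");  // placeholder host
        ConfigHelper.setOutputRpcPort(conf, "9160");              // placeholder port
        ConfigHelper.setOutputPartitioner(conf, "org.apache.cassandra.dht.Murmur3Partitioner");
        ConfigHelper.setOutputColumnFamily(conf, "my_keyspace", "my_table");
        CqlConfigHelper.setOutputCql(conf, "UPDATE my_keyspace.my_table SET value = ?");
    }
}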
From source file: org.apache.hadoop.hive.cassandra.input.cql.HiveCqlInputFormat.java
License: Apache License
@Override
public RecordReader<MapWritableComparable, MapWritable> getRecordReader(InputSplit split, JobConf jobConf,
        final Reporter reporter) throws IOException
{
    HiveCassandraStandardSplit cassandraSplit = (HiveCassandraStandardSplit) split;

    List<String> columns = CqlSerDe.parseColumnMapping(cassandraSplit.getColumnMapping());
    List<Integer> readColIDs = ColumnProjectionUtils.getReadColumnIDs(jobConf);

    if (columns.size() < readColIDs.size())
    {
        throw new IOException("Cannot read more columns than the given table contains.");
    }

    ColumnFamilySplit cfSplit = cassandraSplit.getSplit();
    Job job = new Job(jobConf);

    TaskAttemptContext tac = new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID())
    {
        @Override
        public void progress()
        {
            reporter.progress();
        }
    };

    SlicePredicate predicate = new SlicePredicate();
    predicate.setColumn_names(getColumnNames(columns, readColIDs));

    try
    {
        boolean wideRows = true;
        ConfigHelper.setInputColumnFamily(tac.getConfiguration(), cassandraSplit.getKeyspace(),
                cassandraSplit.getColumnFamily(), wideRows);
        ConfigHelper.setInputSlicePredicate(tac.getConfiguration(), predicate);
        ConfigHelper.setRangeBatchSize(tac.getConfiguration(), cassandraSplit.getRangeBatchSize());
        ConfigHelper.setInputRpcPort(tac.getConfiguration(), cassandraSplit.getPort() + "");
        ConfigHelper.setInputInitialAddress(tac.getConfiguration(), cassandraSplit.getHost());
        ConfigHelper.setInputPartitioner(tac.getConfiguration(), cassandraSplit.getPartitioner());
        // Set Split Size
        ConfigHelper.setInputSplitSize(tac.getConfiguration(), cassandraSplit.getSplitSize());

        LOG.info("Validators : " + tac.getConfiguration().get(CassandraColumnSerDe.CASSANDRA_VALIDATOR_TYPE));

        List<IndexExpression> indexExpr = parseFilterPredicate(jobConf);
        if (indexExpr != null)
        {
            // We have pushed down a filter from the Hive query; we can use this against secondary indexes.
            ConfigHelper.setInputRange(tac.getConfiguration(), indexExpr);
        }

        CqlHiveRecordReader rr = new CqlHiveRecordReader(new CqlPagingRecordReader());
        rr.initialize(cfSplit, tac);
        return rr;
    }
    catch (Exception ie)
    {
        throw new IOException(ie);
    }
}
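On the read side, the split size travels inside the HiveCassandraStandardSplit and is copied back into the task attempt's configuration before the record reader starts. The setter has a matching getter, so the effective value can be verified; a quick round trip (the ~64k-row fallback when the key is unset is my reading of the library default):

Configuration conf = new Configuration();
ConfigHelper.setInputSplitSize(conf, 128 * 1024);
int effective = ConfigHelper.getInputSplitSize(conf);  // 131072; without the set, the library default (~64k rows)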
From source file: org.apache.hadoop.hive.cassandra.input.cql.HiveCqlInputFormat.java
License: Apache License
@Override
public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException
{
    String ks = jobConf.get(AbstractCassandraSerDe.CASSANDRA_KEYSPACE_NAME);
    String cf = jobConf.get(AbstractCassandraSerDe.CASSANDRA_CF_NAME);
    int slicePredicateSize = jobConf.getInt(AbstractCassandraSerDe.CASSANDRA_SLICE_PREDICATE_SIZE,
            AbstractCassandraSerDe.DEFAULT_SLICE_PREDICATE_SIZE);
    int sliceRangeSize = jobConf.getInt(AbstractCassandraSerDe.CASSANDRA_RANGE_BATCH_SIZE,
            AbstractCassandraSerDe.DEFAULT_RANGE_BATCH_SIZE);
    int splitSize = jobConf.getInt(AbstractCassandraSerDe.CASSANDRA_SPLIT_SIZE,
            AbstractCassandraSerDe.DEFAULT_SPLIT_SIZE);
    String cassandraColumnMapping = jobConf.get(AbstractCassandraSerDe.CASSANDRA_COL_MAPPING);
    int rpcPort = jobConf.getInt(AbstractCassandraSerDe.CASSANDRA_PORT, 9160);
    String host = jobConf.get(AbstractCassandraSerDe.CASSANDRA_HOST);
    String partitioner = jobConf.get(AbstractCassandraSerDe.CASSANDRA_PARTITIONER);

    if (cassandraColumnMapping == null)
    {
        throw new IOException("cassandra.columns.mapping required for Cassandra Table.");
    }

    SliceRange range = new SliceRange();
    range.setStart(new byte[0]);
    range.setFinish(new byte[0]);
    range.setReversed(false);
    range.setCount(slicePredicateSize);
    SlicePredicate predicate = new SlicePredicate();
    predicate.setSlice_range(range);

    ConfigHelper.setInputRpcPort(jobConf, "" + rpcPort);
    ConfigHelper.setInputInitialAddress(jobConf, host);
    ConfigHelper.setInputPartitioner(jobConf, partitioner);
    ConfigHelper.setInputSlicePredicate(jobConf, predicate);
    ConfigHelper.setInputColumnFamily(jobConf, ks, cf);
    ConfigHelper.setRangeBatchSize(jobConf, sliceRangeSize);
    ConfigHelper.setInputSplitSize(jobConf, splitSize);

    Job job = new Job(jobConf);
    JobContext jobContext = new JobContext(job.getConfiguration(), job.getJobID());
    Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);

    List<org.apache.hadoop.mapreduce.InputSplit> splits = getSplits(jobContext);
    InputSplit[] results = new InputSplit[splits.size()];

    for (int i = 0; i < splits.size(); ++i)
    {
        HiveCassandraStandardSplit csplit = new HiveCassandraStandardSplit((ColumnFamilySplit) splits.get(i),
                cassandraColumnMapping, tablePaths[0]);
        csplit.setKeyspace(ks);
        csplit.setColumnFamily(cf);
        csplit.setRangeBatchSize(sliceRangeSize);
        csplit.setSplitSize(splitSize);
        csplit.setHost(host);
        csplit.setPort(rpcPort);
        csplit.setSlicePredicateSize(slicePredicateSize);
        csplit.setPartitioner(partitioner);
        csplit.setColumnMapping(cassandraColumnMapping);
        results[i] = csplit;
    }
    return results;
}
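getSplits() is where a Hive table property becomes a Cassandra split size: the value is read from the JobConf with a default and handed to ConfigHelper before the underlying split computation runs. Condensed to the two lines that matter (the literal key "cassandra.input.split.size" is an assumption about what AbstractCassandraSerDe.CASSANDRA_SPLIT_SIZE resolves to):

// Table property -> JobConf -> ConfigHelper; the wrapped getSplits(jobContext)
// call will then aim for roughly splitSize rows per split.
int splitSize = jobConf.getInt("cassandra.input.split.size", 64 * 1024);  // assumed key and default
ConfigHelper.setInputSplitSize(jobConf, splitSize);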
From source file: org.apache.hadoop.hive.cassandra.input.HiveCassandraStandardColumnInputFormat.java
License: Apache License
@Override
public RecordReader<BytesWritable, MapWritable> getRecordReader(InputSplit split, JobConf jobConf,
        final Reporter reporter) throws IOException
{
    HiveCassandraStandardSplit cassandraSplit = (HiveCassandraStandardSplit) split;

    List<String> columns = CassandraColumnSerDe.parseColumnMapping(cassandraSplit.getColumnMapping());
    isTransposed = CassandraColumnSerDe.isTransposed(columns);

    List<Integer> readColIDs = ColumnProjectionUtils.getReadColumnIDs(jobConf);
    if (columns.size() < readColIDs.size())
    {
        throw new IOException("Cannot read more columns than the given table contains.");
    }

    org.apache.cassandra.hadoop.ColumnFamilySplit cfSplit = cassandraSplit.getSplit();
    Job job = new Job(jobConf);

    TaskAttemptContext tac = new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID())
    {
        @Override
        public void progress()
        {
            reporter.progress();
        }
    };

    SlicePredicate predicate = new SlicePredicate();

    if (isTransposed || readColIDs.size() == columns.size() || readColIDs.size() == 0)
    {
        SliceRange range = new SliceRange();
        AbstractType comparator = BytesType.instance;

        String comparatorType = jobConf.get(AbstractCassandraSerDe.CASSANDRA_SLICE_PREDICATE_RANGE_COMPARATOR);
        if (comparatorType != null && !comparatorType.equals(""))
        {
            try
            {
                comparator = TypeParser.parse(comparatorType);
            }
            catch (ConfigurationException ex)
            {
                throw new IOException("Comparator class not found.");
            }
            catch (SyntaxException e)
            {
                throw new IOException(e);
            }
        }

        String sliceStart = jobConf.get(AbstractCassandraSerDe.CASSANDRA_SLICE_PREDICATE_RANGE_START);
        String sliceEnd = jobConf.get(AbstractCassandraSerDe.CASSANDRA_SLICE_PREDICATE_RANGE_FINISH);
        String reversed = jobConf.get(AbstractCassandraSerDe.CASSANDRA_SLICE_PREDICATE_RANGE_REVERSED);

        range.setStart(comparator.fromString(sliceStart == null ? "" : sliceStart));
        range.setFinish(comparator.fromString(sliceEnd == null ? "" : sliceEnd));
        range.setReversed(reversed == null ? false : reversed.equals("true"));
        range.setCount(cassandraSplit.getSlicePredicateSize());
        predicate.setSlice_range(range);
    }
    else
    {
        int iKey = columns.indexOf(CassandraColumnSerDe.CASSANDRA_KEY_COLUMN);
        predicate.setColumn_names(getColumnNames(iKey, columns, readColIDs));
    }

    try
    {
        boolean wideRows = false;
        if (isTransposed && tac.getConfiguration()
                .getBoolean(CassandraColumnSerDe.CASSANDRA_ENABLE_WIDEROW_ITERATOR, true))
        {
            wideRows = true;
        }

        ConfigHelper.setInputColumnFamily(tac.getConfiguration(), cassandraSplit.getKeyspace(),
                cassandraSplit.getColumnFamily(), wideRows);
        ConfigHelper.setInputSlicePredicate(tac.getConfiguration(), predicate);
        ConfigHelper.setRangeBatchSize(tac.getConfiguration(), cassandraSplit.getRangeBatchSize());
        ConfigHelper.setInputRpcPort(tac.getConfiguration(), cassandraSplit.getPort() + "");
        ConfigHelper.setInputInitialAddress(tac.getConfiguration(), cassandraSplit.getHost());
        ConfigHelper.setInputPartitioner(tac.getConfiguration(), cassandraSplit.getPartitioner());
        // Set Split Size
        ConfigHelper.setInputSplitSize(tac.getConfiguration(), cassandraSplit.getSplitSize());

        LOG.info("Validators : " + tac.getConfiguration().get(CassandraColumnSerDe.CASSANDRA_VALIDATOR_TYPE));

        List<IndexExpression> indexExpr = parseFilterPredicate(jobConf);
        if (indexExpr != null)
        {
            // We have pushed down a filter from the Hive query; we can use this against secondary indexes.
            ConfigHelper.setInputRange(tac.getConfiguration(), indexExpr);
        }

        CassandraHiveRecordReader rr = new CassandraHiveRecordReader(new ColumnFamilyRecordReader(), isTransposed);
        rr.initialize(cfSplit, tac);
        return rr;
    }
    catch (Exception ie)
    {
        throw new IOException(ie);
    }
}
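This variant differs from the CQL input format mainly in how it builds the slice predicate: when all (or no) columns are projected, start/finish strings from the configuration are converted to bytes through a configurable comparator. A trimmed sketch of just that conversion, with placeholder column names and a hard-coded comparator where the code above reads configuration:

// Build a slice over a column range, as in the wide/transposed branch above.
static SlicePredicate buildRange() throws ConfigurationException, SyntaxException
{
    AbstractType comparator = TypeParser.parse("UTF8Type");  // stands in for the configured comparator class
    SliceRange range = new SliceRange();
    range.setStart(comparator.fromString("colA"));           // slice start column (placeholder)
    range.setFinish(comparator.fromString("colZ"));          // slice end column (placeholder)
    range.setReversed(false);
    range.setCount(1000);                                    // max columns returned per row
    SlicePredicate predicate = new SlicePredicate();
    predicate.setSlice_range(range);
    return predicate;
}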