Usage examples for org.apache.cassandra.hadoop.ConfigHelper.setInputPartitioner

Method signature:

    public static void setInputPartitioner(Configuration conf, String classname)
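Before the per-project examples below, here is a minimal, self-contained sketch of the typical call site. It assumes a Thrift-era Cassandra/Hadoop setup (ColumnFamilyInputFormat, RPC port 9160); the keyspace and table names are placeholders, not taken from any example on this page.

    import org.apache.cassandra.hadoop.ColumnFamilyInputFormat;
    import org.apache.cassandra.hadoop.ConfigHelper;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.mapreduce.Job;

    public class PartitionerSetupSketch {
        public static Job newJob(Configuration conf) throws Exception {
            Job job = new Job(conf, "partitioner-setup-sketch");
            job.setInputFormatClass(ColumnFamilyInputFormat.class);

            // Connection details for the input side of the job.
            ConfigHelper.setInputRpcPort(job.getConfiguration(), "9160");
            ConfigHelper.setInputInitialAddress(job.getConfiguration(), "localhost");

            // The partitioner string must name the partitioner the cluster actually
            // runs; a mismatch typically surfaces as a runtime error when input
            // splits are computed.
            ConfigHelper.setInputPartitioner(job.getConfiguration(),
                    "org.apache.cassandra.dht.Murmur3Partitioner");

            // Placeholder keyspace/table names.
            ConfigHelper.setInputColumnFamily(job.getConfiguration(), "my_keyspace", "my_table");
            return job;
        }
    }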
From source file: WordCountCounters.java
License: Apache License

    public int run(String[] args) throws Exception {
        Job job = new Job(getConf(), "wordcountcounters");
        job.setJarByClass(WordCountCounters.class);
        job.setMapperClass(SumMapper.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH_PREFIX));

        job.setInputFormatClass(ColumnFamilyInputFormat.class);

        ConfigHelper.setInputRpcPort(job.getConfiguration(), "9160");
        ConfigHelper.setInputInitialAddress(job.getConfiguration(), "localhost");
        ConfigHelper.setInputPartitioner(job.getConfiguration(),
                "org.apache.cassandra.dht.Murmur3Partitioner");
        ConfigHelper.setInputColumnFamily(job.getConfiguration(), WordCount.KEYSPACE,
                WordCountCounters.COUNTER_COLUMN_FAMILY);

        SlicePredicate predicate = new SlicePredicate()
                .setSlice_range(new SliceRange()
                        .setStart(ByteBufferUtil.EMPTY_BYTE_BUFFER)
                        .setFinish(ByteBufferUtil.EMPTY_BYTE_BUFFER)
                        .setCount(100));
        ConfigHelper.setInputSlicePredicate(job.getConfiguration(), predicate);

        job.waitForCompletion(true);
        return 0;
    }
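This example passes the fully qualified partitioner class name. In most Cassandra releases, ConfigHelper also accepts a bare class name (as the WordCount example below does with "RandomPartitioner") and resolves it against the org.apache.cassandra.dht package; when in doubt, the fully qualified form is the safer choice.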
From source file: WordCount.java
License: Apache License

    public int run(String[] args) throws Exception {
        final long startTime = System.currentTimeMillis();
        String outputReducerType = "filesystem";
        if (args != null && args[0].startsWith(OUTPUT_REDUCER_VAR)) {
            String[] s = args[0].split("=");
            if (s != null && s.length == 2)
                outputReducerType = s[1];
        }
        logger.info("output reducer type: " + outputReducerType);

        // Use a smaller page size that doesn't divide the row count evenly,
        // to exercise the paging logic better.
        ConfigHelper.setRangeBatchSize(getConf(), 99);

        for (int i = 0; i < WordCountSetup.TEST_COUNT; i++) {
            String columnName = "userId";

            Job job = new Job(getConf(), "wordcount");
            job.setJarByClass(WordCount.class);
            job.setMapperClass(TokenizerMapper.class);

            if (outputReducerType.equalsIgnoreCase("filesystem")) {
                job.setReducerClass(ReducerToFilesystem.class);
                job.setOutputKeyClass(Text.class);
                job.setOutputValueClass(Text.class);
                FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH_PREFIX + i));
            } else {
                job.setReducerClass(ReducerToCassandra.class);
                job.setMapOutputKeyClass(Text.class);
                job.setMapOutputValueClass(Text.class);
                job.setOutputKeyClass(ByteBuffer.class);
                job.setOutputValueClass(List.class);
                job.setOutputFormatClass(ColumnFamilyOutputFormat.class);
                ConfigHelper.setOutputColumnFamily(job.getConfiguration(), KEYSPACE, OUTPUT_COLUMN_FAMILY);
                job.getConfiguration().set(CONF_COLUMN_NAME, "sum");
            }

            job.setInputFormatClass(ColumnFamilyInputFormat.class);
            ConfigHelper.setInputRpcPort(job.getConfiguration(), "9160");
            ConfigHelper.setInputInitialAddress(job.getConfiguration(), "localhost");
            // Change partitioner here.
            ConfigHelper.setInputPartitioner(job.getConfiguration(), "RandomPartitioner");
            ConfigHelper.setInputColumnFamily(job.getConfiguration(), KEYSPACE, COLUMN_FAMILY);

            SlicePredicate predicate = new SlicePredicate()
                    .setColumn_names(Arrays.asList(ByteBufferUtil.bytes(columnName)));
            ConfigHelper.setInputSlicePredicate(job.getConfiguration(), predicate);

            // The wide-row variant below would cause the predicate to be ignored
            // in favor of scanning everything as a wide row.
            // (Last change: super column support?)
            // ConfigHelper.setInputColumnFamily(job.getConfiguration(), KEYSPACE, COLUMN_FAMILY, true);

            ConfigHelper.setOutputInitialAddress(job.getConfiguration(), "localhost");
            ConfigHelper.setOutputPartitioner(job.getConfiguration(), "RandomPartitioner");

            job.waitForCompletion(true);
        }

        final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
        System.out.println();
        System.out.println("Job Finished in " + duration + " seconds");
        System.out.println();
        return 0;
    }
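Hard-coding "RandomPartitioner" works only when that is what the cluster actually runs. One alternative is to ask the cluster itself over the same Thrift API the input format uses. The sketch below is an assumption-laden illustration, not code from the example above: the helper name is made up, and it relies on the Thrift Cassandra.Client describe_partitioner call being available on the job's classpath.

    import org.apache.cassandra.hadoop.ConfigHelper;
    import org.apache.cassandra.thrift.Cassandra;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.thrift.protocol.TBinaryProtocol;
    import org.apache.thrift.transport.TFramedTransport;
    import org.apache.thrift.transport.TSocket;
    import org.apache.thrift.transport.TTransport;

    // Hypothetical helper: read the partitioner from the cluster instead of hard-coding it.
    static void configurePartitionerFromCluster(Job job, String host, int rpcPort) throws Exception {
        TTransport transport = new TFramedTransport(new TSocket(host, rpcPort));
        Cassandra.Client client = new Cassandra.Client(new TBinaryProtocol(transport));
        transport.open();
        try {
            // Returns the fully qualified class name, e.g.
            // "org.apache.cassandra.dht.RandomPartitioner".
            String partitioner = client.describe_partitioner();
            ConfigHelper.setInputPartitioner(job.getConfiguration(), partitioner);
        } finally {
            transport.close();
        }
    }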
From source file: co.cask.hydrator.plugin.batch.source.BatchCassandraSource.java
License: Apache License

    @Override
    public void prepareRun(BatchSourceContext context) throws Exception {
        Configuration conf = new Configuration();
        conf.clear();

        ConfigHelper.setInputColumnFamily(conf, config.keyspace, config.columnFamily);
        ConfigHelper.setInputInitialAddress(conf, config.initialAddress);
        ConfigHelper.setInputPartitioner(conf, config.partitioner);
        ConfigHelper.setInputRpcPort(conf, (config.port == null) ? "9160" : Integer.toString(config.port));

        Preconditions.checkArgument(
                !(Strings.isNullOrEmpty(config.username) ^ Strings.isNullOrEmpty(config.password)),
                "You must either set both username and password or neither username nor password. "
                        + "Currently, they are username: " + config.username
                        + " and password: " + config.password);
        if (!Strings.isNullOrEmpty(config.username)) {
            ConfigHelper.setInputKeyspaceUserNameAndPassword(conf, config.username, config.password);
        }

        if (!Strings.isNullOrEmpty(config.properties)) {
            for (String pair : config.properties.split(",")) {
                // The key and value of a property might contain spaces, so trim
                // only the leading and trailing ones.
                conf.set(CharMatcher.WHITESPACE.trimFrom(pair.split(":")[0]),
                        CharMatcher.WHITESPACE.trimFrom(pair.split(":")[1]));
            }
        }

        CqlConfigHelper.setInputCql(conf, config.query);
        context.setInput(
                Input.of(config.referenceName, new SourceInputFormatProvider(CqlInputFormat.class, conf)));
    }
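Unlike the Thrift-based examples above, this source combines ConfigHelper's connection settings with CqlConfigHelper.setInputCql and CqlInputFormat, so the partitioner value from the plugin config still has to name the cluster's real partitioner even though the query itself is CQL. Note also the XOR-style Preconditions check: supplying exactly one of username and password fails fast at pipeline preparation instead of surfacing later as an authentication error inside the job.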
From source file: com.dse.pig.udfs.AbstractCassandraStorage.java
License: Apache License

    /** Set Hadoop Cassandra connection settings. */
    protected void setConnectionInformation() throws IOException {
        if (System.getenv(PIG_RPC_PORT) != null) {
            ConfigHelper.setInputRpcPort(conf, System.getenv(PIG_RPC_PORT));
            ConfigHelper.setOutputRpcPort(conf, System.getenv(PIG_RPC_PORT));
        }
        if (System.getenv(PIG_INPUT_RPC_PORT) != null)
            ConfigHelper.setInputRpcPort(conf, System.getenv(PIG_INPUT_RPC_PORT));
        if (System.getenv(PIG_OUTPUT_RPC_PORT) != null)
            ConfigHelper.setOutputRpcPort(conf, System.getenv(PIG_OUTPUT_RPC_PORT));

        if (System.getenv(PIG_INITIAL_ADDRESS) != null) {
            ConfigHelper.setInputInitialAddress(conf, System.getenv(PIG_INITIAL_ADDRESS));
            ConfigHelper.setOutputInitialAddress(conf, System.getenv(PIG_INITIAL_ADDRESS));
        }
        if (System.getenv(PIG_INPUT_INITIAL_ADDRESS) != null)
            ConfigHelper.setInputInitialAddress(conf, System.getenv(PIG_INPUT_INITIAL_ADDRESS));
        if (System.getenv(PIG_OUTPUT_INITIAL_ADDRESS) != null)
            ConfigHelper.setOutputInitialAddress(conf, System.getenv(PIG_OUTPUT_INITIAL_ADDRESS));

        if (System.getenv(PIG_PARTITIONER) != null) {
            ConfigHelper.setInputPartitioner(conf, System.getenv(PIG_PARTITIONER));
            ConfigHelper.setOutputPartitioner(conf, System.getenv(PIG_PARTITIONER));
        }
        if (System.getenv(PIG_INPUT_PARTITIONER) != null)
            ConfigHelper.setInputPartitioner(conf, System.getenv(PIG_INPUT_PARTITIONER));
        if (System.getenv(PIG_OUTPUT_PARTITIONER) != null)
            ConfigHelper.setOutputPartitioner(conf, System.getenv(PIG_OUTPUT_PARTITIONER));

        if (System.getenv(PIG_INPUT_FORMAT) != null)
            inputFormatClass = getFullyQualifiedClassName(System.getenv(PIG_INPUT_FORMAT));
        else
            inputFormatClass = DEFAULT_INPUT_FORMAT;

        if (System.getenv(PIG_OUTPUT_FORMAT) != null)
            outputFormatClass = getFullyQualifiedClassName(System.getenv(PIG_OUTPUT_FORMAT));
        else
            outputFormatClass = DEFAULT_OUTPUT_FORMAT;
    }
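The ordering here is deliberate: the generic PIG_PARTITIONER branch runs before the direction-specific PIG_INPUT_PARTITIONER and PIG_OUTPUT_PARTITIONER branches, so the specific variables win when both are set, because their later setInputPartitioner/setOutputPartitioner calls overwrite the earlier ones. The same override pattern applies to the RPC-port and initial-address variables.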
From source file: com.dse.pig.udfs.CqlStorage.java
License: Apache License

    /** Set read configuration settings. */
    public void setLocation(String location, Job job) throws IOException {
        conf = job.getConfiguration();
        setLocationFromUri(location);

        if (username != null && password != null)
            ConfigHelper.setInputKeyspaceUserNameAndPassword(conf, username, password);
        if (splitSize > 0)
            ConfigHelper.setInputSplitSize(conf, splitSize);
        if (partitionerClass != null)
            ConfigHelper.setInputPartitioner(conf, partitionerClass);
        if (rpcPort != null)
            ConfigHelper.setInputRpcPort(conf, rpcPort);
        if (initHostAddress != null)
            ConfigHelper.setInputInitialAddress(conf, initHostAddress);

        ConfigHelper.setInputColumnFamily(conf, keyspace, column_family);
        setConnectionInformation();

        CqlConfigHelper.setInputCQLPageRowSize(conf, String.valueOf(pageSize));
        if (columns != null && !columns.trim().isEmpty())
            CqlConfigHelper.setInputColumns(conf, columns);

        String whereClauseForPartitionFilter = getWhereClauseForPartitionFilter();
        String wc = whereClause != null && !whereClause.trim().isEmpty()
                ? whereClauseForPartitionFilter == null
                        ? whereClause
                        : String.format("%s AND %s", whereClause.trim(), whereClauseForPartitionFilter)
                : whereClauseForPartitionFilter;
        if (wc != null) {
            logger.debug("where clause: {}", wc);
            CqlConfigHelper.setInputWhereClauses(conf, wc);
        }

        if (System.getenv(PIG_INPUT_SPLIT_SIZE) != null) {
            try {
                ConfigHelper.setInputSplitSize(conf, Integer.valueOf(System.getenv(PIG_INPUT_SPLIT_SIZE)));
            } catch (NumberFormatException e) {
                throw new IOException("PIG_INPUT_SPLIT_SIZE is not a number", e);
            }
        }

        if (ConfigHelper.getInputRpcPort(conf) == 0)
            throw new IOException("PIG_INPUT_RPC_PORT or PIG_RPC_PORT environment variable not set");
        if (ConfigHelper.getInputInitialAddress(conf) == null)
            throw new IOException("PIG_INPUT_INITIAL_ADDRESS or PIG_INITIAL_ADDRESS environment variable not set");
        if (ConfigHelper.getInputPartitioner(conf) == null)
            throw new IOException("PIG_INPUT_PARTITIONER or PIG_PARTITIONER environment variable not set");

        if (loadSignature == null)
            loadSignature = location;
        initSchema(loadSignature);
    }
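For context, partitionerClass, rpcPort, and initHostAddress come out of setLocationFromUri, which in the upstream Apache CqlStorage parses a cql:// location of roughly the form cql://[username:password@]&lt;keyspace&gt;/&lt;table&gt;[?partitioner=&lt;class&gt;&amp;split_size=&lt;n&gt;&amp;...]. The parameter names may differ in this DSE fork, so treat that shape as illustrative rather than authoritative; whatever the URI does not supply must come from the PIG_* environment variables validated at the end of the method.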
From source file: net.orpiske.tcs.wc.main.Main.java
License: Apache License

    /**
     * Set up the M/R job to read from the references table in Cassandra.
     *
     * @param configuration the Hadoop configuration to populate
     */
    private void inputConfiguration(Configuration configuration) {
        ConfigHelper.setInputRpcPort(configuration, DB_PORT);
        ConfigHelper.setInputInitialAddress(configuration, DB_HOST);
        ConfigHelper.setInputPartitioner(configuration, PARTITIONER);
        ConfigHelper.setInputColumnFamily(configuration, KEYSPACE, INPUT_TABLE);

        List<ByteBuffer> columns = Arrays.asList(
                ByteBufferUtil.bytes("reference_text"),
                ByteBufferUtil.bytes("domain"));
        SlicePredicate predicate = new SlicePredicate().setColumn_names(columns);
        ConfigHelper.setInputSlicePredicate(configuration, predicate);
    }
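inputConfiguration only populates the Configuration; the driver still has to hand it to a job. A minimal sketch of how such a helper might be wired up follows. The job name, driver structure, and waitForCompletion call are assumptions, not taken from the source:

    // Hypothetical driver; everything except inputConfiguration(...) is assumed.
    Configuration configuration = new Configuration();
    inputConfiguration(configuration);

    Job job = new Job(configuration, "tcs-word-count");
    job.setJarByClass(Main.class);
    job.setInputFormatClass(ColumnFamilyInputFormat.class);
    job.waitForCompletion(true);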
From source file: org.apache.hadoop.hive.cassandra.input.cql.HiveCqlInputFormat.java
License: Apache License

    @Override
    public RecordReader<MapWritableComparable, MapWritable> getRecordReader(InputSplit split, JobConf jobConf,
            final Reporter reporter) throws IOException {
        HiveCassandraStandardSplit cassandraSplit = (HiveCassandraStandardSplit) split;

        List<String> columns = CqlSerDe.parseColumnMapping(cassandraSplit.getColumnMapping());
        List<Integer> readColIDs = ColumnProjectionUtils.getReadColumnIDs(jobConf);

        if (columns.size() < readColIDs.size()) {
            throw new IOException("Cannot read more columns than the given table contains.");
        }

        ColumnFamilySplit cfSplit = cassandraSplit.getSplit();
        Job job = new Job(jobConf);

        TaskAttemptContext tac = new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID()) {
            @Override
            public void progress() {
                reporter.progress();
            }
        };

        SlicePredicate predicate = new SlicePredicate();
        predicate.setColumn_names(getColumnNames(columns, readColIDs));

        try {
            boolean wideRows = true;
            ConfigHelper.setInputColumnFamily(tac.getConfiguration(), cassandraSplit.getKeyspace(),
                    cassandraSplit.getColumnFamily(), wideRows);
            ConfigHelper.setInputSlicePredicate(tac.getConfiguration(), predicate);
            ConfigHelper.setRangeBatchSize(tac.getConfiguration(), cassandraSplit.getRangeBatchSize());
            ConfigHelper.setInputRpcPort(tac.getConfiguration(), cassandraSplit.getPort() + "");
            ConfigHelper.setInputInitialAddress(tac.getConfiguration(), cassandraSplit.getHost());
            ConfigHelper.setInputPartitioner(tac.getConfiguration(), cassandraSplit.getPartitioner());
            // Set split size.
            ConfigHelper.setInputSplitSize(tac.getConfiguration(), cassandraSplit.getSplitSize());

            LOG.info("Validators : " + tac.getConfiguration().get(CassandraColumnSerDe.CASSANDRA_VALIDATOR_TYPE));

            List<IndexExpression> indexExpr = parseFilterPredicate(jobConf);
            if (indexExpr != null) {
                // We have pushed down a filter from the Hive query; we can use
                // it against secondary indexes.
                ConfigHelper.setInputRange(tac.getConfiguration(), indexExpr);
            }

            CqlHiveRecordReader rr = new CqlHiveRecordReader(new CqlPagingRecordReader());
            rr.initialize(cfSplit, tac);
            return rr;
        } catch (Exception ie) {
            throw new IOException(ie);
        }
    }
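Copying the host, port, partitioner, and split size from the HiveCassandraStandardSplit into the TaskAttemptContext's configuration is what lets the wrapped CqlPagingRecordReader work on the task side: each map task rebuilds its Cassandra connection from this per-split configuration rather than from the driver's JobConf, which is why setInputPartitioner has to be called again here even if the driver already set it.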