Java tutorial
/* * Copyright (c) 2007-2014 Concurrent, Inc. All Rights Reserved. * * Project and contact information: http://www.cascading.org/ * * This file is part of the Cascading project. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package cascading.hive; import java.sql.Connection; import java.sql.DriverManager; import java.sql.ResultSet; import java.sql.Statement; import java.util.Properties; import cascading.flow.Flow; import cascading.flow.FlowProcess; import cascading.flow.hadoop.HadoopFlowConnector; import cascading.operation.BaseOperation; import cascading.operation.Filter; import cascading.operation.FilterCall; import cascading.operation.Function; import cascading.operation.FunctionCall; import cascading.pipe.Each; import cascading.pipe.Pipe; import cascading.pipe.assembly.Retain; import cascading.property.AppProps; import cascading.scheme.hadoop.TextDelimited; import cascading.tap.SinkMode; import cascading.tap.Tap; import cascading.tap.hadoop.Hfs; import cascading.tap.hive.HivePartitionTap; import cascading.tap.hive.HiveTableDescriptor; import cascading.tap.hive.HiveTap; import cascading.tuple.Fields; import cascading.tuple.TupleEntry; import cascading.tuple.TupleEntryIterator; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapred.JobConf; /** * Demo app showing partitioning support. */ public class HivePartitionDemo { private static String accessLog = "file://" + System.getProperty("user.dir") + "/src/main/resources/access.log"; public static void main(String[] args) throws Exception { Properties properties = new Properties(); AppProps.setApplicationName(properties, "cascading hive partitioning demo"); JobConf jobConf = new JobConf(); FileSystem fs = FileSystem.get(jobConf); fs.copyFromLocalFile(false, true, new Path(accessLog), new Path("/tmp/access.log")); String[] columnNames = new String[] { "ts", "customer", "bucket", "operation", "key", "region" }; String[] columnTypes = new String[] { "timestamp", "string", "string", "string", "string", "string" }; String[] partitionKeys = new String[] { "region" }; HiveTableDescriptor partitionedDescriptor = new HiveTableDescriptor("mydb", "mytable", columnNames, columnTypes, partitionKeys, "\t"); HiveTap hiveTap = new HiveTap(partitionedDescriptor, partitionedDescriptor.toScheme()); Tap partitionTap = new HivePartitionTap(hiveTap); Fields allFields = new Fields(columnNames); Tap input = new Hfs(new TextDelimited(allFields), "hdfs:/tmp/access.log"); class Echo extends BaseOperation implements Function { public Echo(Fields fieldDeclaration) { super(2, fieldDeclaration); } @Override public void operate(FlowProcess flowProcess, FunctionCall functionCall) { TupleEntry argument = functionCall.getArguments(); functionCall.getOutputCollector().add(argument.getTuple()); } } Pipe pipe = new Each(" import ", allFields, new Echo(allFields), Fields.RESULTS); Flow flow = new HadoopFlowConnector().connect(input, partitionTap, pipe); flow.complete(); Class.forName("org.apache.hadoop.hive.jdbc.HiveDriver"); Connection con = DriverManager.getConnection("jdbc:hive://", "", ""); Statement stmt = con.createStatement(); ResultSet rs = stmt.executeQuery("select * from mydb.mytable where region = 'ASIA' "); String[] names = partitionedDescriptor.getColumnNames(); System.out.println("----------------------Hive JDBC--------------------------"); while (rs.next()) { StringBuffer buf = new StringBuffer("JDBC>>> "); for (int i = 0; i < names.length; i++) { String name = names[i]; buf.append(name).append("=").append(rs.getObject(i + 1)).append(", "); } System.out.println(buf.toString()); } System.out.println("---------------------------------------------------------"); stmt.close(); con.close(); // do the same as the JDBC above, but in Cascading. class RegionFilter extends BaseOperation implements Filter { final String region; public RegionFilter(String region) { this.region = region; } @Override public boolean isRemove(FlowProcess flowProcess, FilterCall filterCall) { if (filterCall.getArguments().getString("region").equals(this.region)) return false; return true; } } Tap requestsInAsiaSink = new Hfs(new TextDelimited(allFields), "hdfs:/tmp/requests-from-asia", SinkMode.REPLACE); Pipe headPipe = new Each("requests from ASIA", allFields, new RegionFilter("ASIA")); Flow headFlow = new HadoopFlowConnector().connect(partitionTap, requestsInAsiaSink, headPipe); headFlow.complete(); TupleEntryIterator tupleEntryIterator = requestsInAsiaSink.openForRead(headFlow.getFlowProcess()); while (tupleEntryIterator.hasNext()) { TupleEntry tupleEntry = tupleEntryIterator.next(); System.out.println("Cascading>>> " + tupleEntry); } tupleEntryIterator.close(); } }