Java tutorial
/** * Copyright (C) 2016 Joshua Auerbach * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.qcert.sql; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import org.antlr.v4.runtime.ANTLRInputStream; import org.antlr.v4.runtime.CharStream; import org.antlr.v4.runtime.Token; import com.facebook.presto.sql.parser.CaseInsensitiveStream; import com.facebook.presto.sql.parser.ParsingException; import com.facebook.presto.sql.parser.SqlBaseLexer; import com.facebook.presto.sql.parser.SqlParser; import com.facebook.presto.sql.parser.StatementSplitter; import com.facebook.presto.sql.tree.CreateView; import com.facebook.presto.sql.tree.Node; import com.facebook.presto.sql.tree.Query; import com.facebook.presto.sql.tree.QuerySpecification; import com.facebook.presto.sql.tree.Select; import com.facebook.presto.sql.tree.SelectItem; import com.facebook.presto.sql.tree.SingleColumn; import com.facebook.presto.sql.tree.Statement; /** * Utilities that work with presto-parser to produce other useful forms */ public class PrestoEncoder { /** Causes printing of SQL before and after lexical fixup */ private static final boolean VERBOSE_LEXICAL = Boolean.getBoolean("VERBOSE_LEXICAL"); /** * Encode a list of presto tree nodes as an S-expression for importing into Coq code * @param toEncode the list of presto tree nodes to encode * @param useDateNameHeuristic flag indicating that field names ending in "date" denote dates * @return the S-expression string */ public static String encode(List<? extends Node> toEncode, boolean useDateNameHeuristic) { StringBuilder buffer = new StringBuilder(); buffer.append("(statements "); for (Node node : toEncode) { encode(buffer, node, useDateNameHeuristic); } buffer.append(")"); return buffer.toString(); } /** * Encode an individual presto tree node as an S-expression using an existing buffer * @param buffer the existing buffer * @param toEncode the presto tree node * @return the buffer, for convenience */ public static StringBuilder encode(StringBuilder buffer, Node toEncode, boolean useDateNameHeuristic) { return new EncodingVisitor(useDateNameHeuristic).process(toEncode, buffer); } /** * Parse a SQL source string. First separates it into statements, then parses the Statements. * @param sourceString the SQL source string * @return the parsed statement(s) as an List<Statement> */ public static List<Statement> parse(String query) { StatementSplitter splitter = new StatementSplitter(query); SqlParser parser = new SqlParser(); ArrayList<Statement> results = new ArrayList<Statement>(1); for (com.facebook.presto.sql.parser.StatementSplitter.Statement statement : splitter .getCompleteStatements()) { String body = statement.statement(); Statement result = parseStatement(parser, body); results.add(result); } if (results.isEmpty()) { throw new ParsingException("input query does not contain any statements"); } return results; } /** * Parse a SQL source in string form into one or more presto nodes and then encode the result as * an S-expression string * @param sourceString the SQL source string to parse and then encode * @param interleaved if true, each statement is parsed then encoded with the outer loop iterating through statements; * otherwise, all parsing is done, and then all encoding * @param useDateNameHeuristic if true, all field names ending in "date" are assumed to contain dates * @return the S-expression string */ public static String parseAndEncode(String sourceString, boolean interleaved, boolean useDateNameHeuristic) { if (interleaved) return interleavedParseAndEncode(sourceString, useDateNameHeuristic); return encode(parse(sourceString), useDateNameHeuristic); } /** * Apply necessary fixups at the lexical level (needed to get the query to even be parsed by presto-parser). * 1. Convert occurances of 'NN [days|months|years]' to 'interval NN [day|month|year]' (needed by many TPC-DS queries). * 2. Remove parenthesized numeric field after an interval unit field (needed to run TPC-H query 1). * 3. Remove parenthesized name list in 'create view NAME (...) as' and relocate the names into the body of the statement (needed to run TPC-H query 15). * 4. On a 'create table' (schema), remove occurances of NOT NULL, which presto does not handle. * Fixup 3 is only partially lexical; the lexical phase remembers the names and a visitor updates the body after parsing. * @param query the original query * @param foundNames an initially empty list to which names found in fixup 3 may be added for later processing * @return the altered query */ private static String applyLexicalFixups(String query, List<String> foundNames) { if (VERBOSE_LEXICAL) { System.out.println("Before:"); System.out.println(query); } CharStream stream = new CaseInsensitiveStream(new ANTLRInputStream(query)); SqlBaseLexer lexer = new SqlBaseLexer(stream); StringBuilder buffer = new StringBuilder(); Token savedInteger = null; List<Token> savedWS = new ArrayList<>(); FixupState state = FixupState.OPEN; for (Token token : lexer.getAllTokens()) { /* The 'state' is used for fixups 2 and 3 */ switch (state) { case ELIDE1: state = FixupState.ELIDE2; continue; case ELIDE2: state = FixupState.OPEN; continue; case ELIDELIST: if (token.getType() == SqlBaseLexer.AS) { buffer.append(token.getText()); state = FixupState.OPEN; } else if (token.getType() != SqlBaseLexer.WS && !token.getText().equals(",") && !token.getText().equals(")")) { foundNames.add(token.getText()); } continue; case CREATE: buffer.append(token.getText()); if (token.getType() == SqlBaseLexer.VIEW) state = FixupState.VIEW; else if (token.getType() == SqlBaseLexer.TABLE) state = FixupState.TABLE; else if (token.getType() != SqlBaseLexer.WS) state = FixupState.OPEN; continue; case VIEW: if (token.getText().equals("(")) { state = FixupState.ELIDELIST; } else { buffer.append(token.getText()); if (token.getType() == SqlBaseLexer.AS) state = FixupState.OPEN; } continue; case TABLE: if (token.getType() != SqlBaseLexer.NOT && token.getType() != SqlBaseLexer.NULL) buffer.append(token.getText()); continue; case INTERVAL: buffer.append(token.getText()); if (getUnit(token.getText()) != null) state = FixupState.UNIT; continue; case UNIT: if (token.getText().equals("(")) { state = FixupState.ELIDE1; } else { buffer.append(token.getText()); if (token.getType() != SqlBaseLexer.WS) state = FixupState.OPEN; } continue; case OPEN: if (token.getType() == SqlBaseLexer.INTERVAL) { state = FixupState.INTERVAL; buffer.append(token.getText()); continue; } else if (token.getType() == SqlBaseLexer.CREATE) { state = FixupState.CREATE; buffer.append(token.getText()); continue; } // If 'open' and there is not a transition to another state, break the switch and try fixup 1. This should be the only break in the switch. break; } /* The 'savedInteger' is used for fixup 1 */ if (token.getType() == SqlBaseLexer.INTEGER_VALUE) savedInteger = token; else if (savedInteger != null) { String unit = getUnit(token.getText()); if (unit != null) { buffer.append("interval '").append(savedInteger.getText()).append("' ").append(unit); savedInteger = null; savedWS.clear(); } else if (token.getType() == SqlBaseLexer.WS) savedWS.add(token); else { buffer.append(savedInteger.getText()); for (Token ws : savedWS) buffer.append(ws.getText()); buffer.append(token.getText()); savedInteger = null; savedWS.clear(); } } else buffer.append(token.getText()); } if (savedInteger != null) buffer.append(savedInteger.getText()); query = buffer.toString(); if (VERBOSE_LEXICAL) { System.out.println("After:"); System.out.println(query); } return query; } /** * Distribute names found during lexical fixup of a 'create view' statement * @param view the statement * @param names the names * @return a new create view statement with the names distributed into the body */ private static CreateView distributeNames(CreateView view, List<String> names) { if (view.getQuery().getQueryBody() instanceof QuerySpecification) { QuerySpecification body = (QuerySpecification) view.getQuery().getQueryBody(); Select select = body.getSelect(); if (select.getSelectItems().size() != names.size()) throw new IllegalStateException( "Don't know how to distribute names for 'create view' statement when the number of select items doesn't match the number of names"); List<SelectItem> newItems = new ArrayList<>(); Iterator<String> namesIter = names.iterator(); for (SelectItem oldItem : select.getSelectItems()) { newItems.add(new SingleColumn(((SingleColumn) oldItem).getExpression(), namesIter.next())); } body = new QuerySpecification(new Select(select.isDistinct(), newItems), body.getFrom(), body.getWhere(), body.getGroupBy(), body.getHaving(), body.getOrderBy(), body.getLimit()); return new CreateView(view.getName(), new Query(view.getQuery().getWith(), body, view.getQuery().getOrderBy(), view.getQuery().getLimit(), view.getQuery().getApproximate()), view.isReplace()); } throw new IllegalStateException( "Don't know how to distribute names for 'create view' statement when body is not a QuerySpecification"); } /** * Utility to recognize interval unit names in either singular or plural form and return the singular of same */ private static String getUnit(String text) { switch (text.trim().toLowerCase()) { case "days": case "day": return "day"; case "months": case "month": return "month"; case "years": case "year": return "year"; } return null; } /** * Alternative parse/encode loop useful when there are lots of statements. Isolates what parses but does not encode from * what doesn't parse at all * @param sourceString the source string * @param useDateNameHeuristic if true, all field names ending in "date" are assumed to denote dates * @return the parsed String, which might be vacuous, while rejecting all statements that don't parse (encoding errors still * cause termination) */ private static String interleavedParseAndEncode(String sourceString, boolean useDateNameHeuristic) { StatementSplitter splitter = new StatementSplitter(sourceString); SqlParser parser = new SqlParser(); StringBuilder buffer = new StringBuilder().append("(statements "); int successes = 0; for (StatementSplitter.Statement statement : splitter.getCompleteStatements()) { String body = statement.statement(); Statement result; try { result = parseStatement(parser, body); } catch (Exception e) { String msg = e.getMessage(); if (msg == null) e.printStackTrace(); System.out.println(msg == null ? e.toString() : msg); continue; } try { encode(buffer, result, useDateNameHeuristic); successes++; } catch (Exception e) { System.out.println("Successes: " + successes); throw e; } } System.out.println("Successes: " + successes); return buffer.append(")").toString(); } /** Parse an individual statement, applying lexical fixups first */ private static Statement parseStatement(SqlParser parser, String body) { List<String> names = new ArrayList<>(); body = applyLexicalFixups(body, names); Statement result = parser.createStatement(body); if (!names.isEmpty()) result = distributeNames((CreateView) result, names); // No CCE possible since result must be a CreateView in this case return result; } /** Enumeration used in lexical fixups */ private enum FixupState { OPEN, /* no interval or create view observed */ INTERVAL, /* observed 'interval' but not yet any unit */ UNIT, /* observed a unit, not yet eliding */ ELIDE1, /* started eliding paren'd number */ ELIDE2, /* still eliding paren'd number */ CREATE, /* observed 'create', not yet 'view' or 'table' */ VIEW, /* observed 'view' after 'create' */ ELIDELIST, /* observed parenthesized list after 'view', before 'as' and names are being elided and saved */ TABLE, /* observed 'table' after 'create' */ } }