BuildLexerBase.java: a tokenizer for the BUILD language
/*
 * Copyright 2016 The Bazel Authors. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.google.idea.blaze.base.lang.buildfile.lexer;

import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import java.util.List;
import java.util.Map;
import java.util.Stack;
import javax.annotation.Nullable;

/**
 * A tokenizer for the BUILD language.
 *
 * <p>Copied from blaze/bazel's lexer. The differences are:
 *
 * <ol>
 *   <li>Blaze's lexer isn't 'faithful', in that it reorders characters, skips characters, and adds
 *       ghost characters. We can't do that, because we need to match the editor's view of the
 *       document.
 *   <li>Blaze's lexer only lexes entire files (it can't incrementally lex part of a file, starting
 *       from a given indent stack depth).
 * </ol>
 */
public class BuildLexerBase {

  /**
   * When tokenizing for the purposes of parsing, we handle indentation.<br>
   * For syntax highlighting, we need to tokenize every character, and don't care about
   * indentation.
   */
  public enum LexerMode {
    Parsing,
    SyntaxHighlighting
  }

  // Characters that can come immediately prior to an '=' character to generate
  // a different token
  private static final Map<Character, TokenKind> EQUAL_TOKENS =
      ImmutableMap.<Character, TokenKind>builder()
          .put('=', TokenKind.EQUALS_EQUALS)
          .put('!', TokenKind.NOT_EQUALS)
          .put('>', TokenKind.GREATER_EQUALS)
          .put('<', TokenKind.LESS_EQUALS)
          .put('+', TokenKind.PLUS_EQUALS)
          .put('-', TokenKind.MINUS_EQUALS)
          .put('*', TokenKind.STAR_EQUALS)
          .put('/', TokenKind.SLASH_EQUALS)
          .put('%', TokenKind.PERCENT_EQUALS)
          .build();

  private final LexerMode mode;

  // Input buffer and position
  private final char[] buffer;
  private int pos;

  private final List<Token> tokens;

  // The number of unclosed open-parens ("(", '{', '[') at the current point in
  // the stream. Whitespace is handled differently when this is nonzero.
  private int openParenStackDepth = 0;

  // The stack of enclosing indentation levels; always contains '0' at the
  // bottom.
  private final Stack<Integer> indentStack = new Stack<>();

  private boolean containsErrors;

  /**
   * Constructs a lexer which tokenizes the contents of the specified input. Any errors encountered
   * during lexing are recorded internally; see {@link #containsErrors()}.
   */
  public BuildLexerBase(CharSequence input, int initialStackDepth, LexerMode mode) {
    this.buffer = input.toString().toCharArray();
    // Empirical measurements show roughly 1 token per 8 characters in buffer.
    this.tokens = Lists.newArrayListWithExpectedSize(buffer.length / 8);
    this.pos = 0;
    this.openParenStackDepth = initialStackDepth;
    this.mode = mode;
    indentStack.push(0);
    tokenize();
  }

  /** The number of unclosed open-parens ("(", '{', '[') at the end of this string. */
  public int getOpenParenStackDepth() {
    return openParenStackDepth;
  }

  /**
   * Returns true if there were errors during scanning of this input file or string. The
   * BuildLexerBase may attempt to recover from errors, but clients should not rely on the results
   * of scanning if this flag is set.
   */
  public boolean containsErrors() {
    return containsErrors;
  }

  /** Returns the (mutable) list of tokens generated by the BuildLexerBase. */
  public List<Token> getTokens() {
    return tokens;
  }

  private void popParen() {
    if (openParenStackDepth == 0) {
      error("indentation error");
    } else {
      openParenStackDepth--;
    }
  }

  private void error(String message) {
    error(message, pos - 1, pos - 1);
  }

  protected void error(String message, int start, int end) {
    this.containsErrors = true;
  }

  /** invariant: symbol positions are half-open intervals. */
  private void addToken(TokenKind kind, int left, int right) {
    addToken(kind, left, right, null);
  }

  private void addToken(TokenKind kind, int left, int right, @Nullable Object value) {
    tokens.add(new Token(kind, left, right, value));
  }

  /**
   * Parses an end-of-line sequence, handling statement indentation correctly.
   *
   * <p>UNIX newlines are assumed (LF). Carriage returns are always ignored.
   *
   * <p>ON ENTRY: 'pos' is the index of the char after '\n'. ON EXIT: 'pos' is the index of the
   * next non-space char after '\n'.
   */
  protected void newline() {
    if (mode == LexerMode.SyntaxHighlighting) {
      addToken(TokenKind.NEWLINE, pos - 1, pos);
      return;
    }
    if (openParenStackDepth > 0) {
      newlineInsideExpression(); // in an expression: ignore space
    } else {
      newlineOutsideExpression(); // generate NEWLINE/INDENT/DEDENT tokens
    }
  }

  private void newlineInsideExpression() {
    int oldPos = pos - 1;
    while (pos < buffer.length) {
      switch (buffer[pos]) {
        case ' ':
        case '\t':
        case '\r':
          pos++;
          break;
        default: // ignored by the parser
          addToken(TokenKind.WHITESPACE, oldPos, pos);
          return;
      }
    }
    addToken(TokenKind.WHITESPACE, oldPos, pos);
  }

  /**
   * Handle INDENT and DEDENT within statements.
   *
   * <p>Note these tokens have zero length -- this is because we can have an arbitrary number of
   * dedents squeezed into some number of characters, and the size of all the lexical elements must
   * match the number of characters in the file.
   */
  private void newlineOutsideExpression() {
    int oldPos = pos - 1;
    if (pos > 1) { // skip over newline at start of file
      addToken(TokenKind.NEWLINE, oldPos, pos);
      oldPos = pos;
    }
    // we're in a stmt: suck up space at beginning of next line
    int indentLen = 0;
    while (pos < buffer.length) {
      char c = buffer[pos];
      if (c == ' ') {
        indentLen++;
        pos++;
      } else if (c == '\t') {
        indentLen += 8 - indentLen % 8;
        pos++;
      } else if (c == '\n') { // entirely blank line: ignore
        indentLen = 0;
        pos++;
      } else if (c == '#') { // line containing only indented comment
        if (oldPos != pos) {
          addToken(TokenKind.WHITESPACE, oldPos, pos);
          oldPos = pos;
        }
        while (pos < buffer.length && c != '\n') {
          c = buffer[pos++];
        }
        addToken(TokenKind.COMMENT, oldPos, pos - 1, bufferSlice(oldPos, pos - 1));
        oldPos = pos - 1;
        indentLen = 0;
      } else { // printing character
        break;
      }
    }
    if (oldPos != pos) {
      addToken(TokenKind.WHITESPACE, oldPos, pos);
    }
    if (pos == buffer.length) {
      indentLen = 0;
    } // trailing space on last line
    int peekedIndent = indentStack.peek();
    if (peekedIndent < indentLen) { // push a level
      indentStack.push(indentLen);
      addToken(TokenKind.INDENT, pos, pos);
    } else if (peekedIndent > indentLen) { // pop one or more levels
      while (peekedIndent > indentLen) {
        indentStack.pop();
        addToken(TokenKind.DEDENT, pos, pos);
        peekedIndent = indentStack.peek();
      }
      if (peekedIndent < indentLen) {
        error("indentation error");
      }
    }
  }

  /** Collapse adjacent whitespace characters into a single token. */
  private void addWhitespace() {
    int oldPos = pos - 1;
    while (pos < buffer.length) {
      switch (buffer[pos]) {
        case ' ':
        case '\t':
        case '\r':
          pos++;
          break;
        default:
          addToken(TokenKind.WHITESPACE, oldPos, pos, bufferSlice(oldPos, pos));
          return;
      }
    }
    addToken(TokenKind.WHITESPACE, oldPos, pos, bufferSlice(oldPos, pos));
  }

  /**
   * Returns true if current position is in the middle of a triple quote delimiter (3 x quot), and
   * advances 'pos' by two if so.
   */
  private boolean skipTripleQuote(char quot) {
    if (pos + 1 < buffer.length && buffer[pos] == quot && buffer[pos + 1] == quot) {
      pos += 2;
      return true;
    } else {
      return false;
    }
  }

  /**
   * Scans a string literal delimited by 'quot', containing escape sequences.
   *
   * <p>ON ENTRY: 'pos' is 1 + the index of the first delimiter. ON EXIT: 'pos' is 1 + the index of
   * the last delimiter.
   */
  private void escapedStringLiteral(char quot, boolean isRaw) {
    int oldPos = isRaw ? pos - 2 : pos - 1;
    boolean inTripleQuote = skipTripleQuote(quot);
    // more expensive second choice that expands escaped into a buffer
    StringBuilder literal = new StringBuilder();
    while (pos < buffer.length) {
      char c = buffer[pos];
      pos++;
      switch (c) {
        case '\n':
          if (inTripleQuote) {
            literal.append(c);
            break;
          } else {
            error("unterminated string literal at eol", oldPos, pos);
            addToken(TokenKind.STRING, oldPos, pos - 1, literal.toString());
            newline();
            return;
          }
        case '\\':
          if (pos == buffer.length) {
            error("unterminated string literal at eof", oldPos, pos);
            addToken(TokenKind.STRING, oldPos, pos - 1, literal.toString());
            return;
          }
          if (isRaw) {
            // Insert \ and the following character.
            // As in Python, it means that a raw string can never end with a single \.
            literal.append('\\');
            literal.append(buffer[pos]);
            pos++;
            break;
          }
          c = buffer[pos];
          pos++;
          switch (c) {
            case '\n':
              // ignore end of line character
              break;
            case 'n':
              literal.append('\n');
              break;
            case 'r':
              literal.append('\r');
              break;
            case 't':
              literal.append('\t');
              break;
            case '\\':
              literal.append('\\');
              break;
            case '\'':
              literal.append('\'');
              break;
            case '"':
              literal.append('"');
              break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7': { // octal escape
              int octal = c - '0';
              if (pos < buffer.length) {
                c = buffer[pos];
                if (c >= '0' && c <= '7') {
                  pos++;
                  octal = (octal << 3) | (c - '0');
                  if (pos < buffer.length) {
                    c = buffer[pos];
                    if (c >= '0' && c <= '7') {
                      pos++;
                      octal = (octal << 3) | (c - '0');
                    }
                  }
                }
              }
              literal.append((char) (octal & 0xff));
              break;
            }
            case 'a': case 'b': case 'f': case 'N':
            case 'u': case 'U': case 'v': case 'x':
              // exists in Python but not implemented in Blaze => error
              error("escape sequence not implemented: \\" + c, oldPos, pos);
              break;
            default:
              // unknown char escape => "\literal"
              literal.append('\\');
              literal.append(c);
              break;
          }
          break;
        case '\'':
        case '"':
          if (c != quot || (inTripleQuote && !skipTripleQuote(quot))) {
            // Non-matching quote, treat it like a regular char.
            literal.append(c);
          } else {
            // Matching close-delimiter, all done.
            addToken(TokenKind.STRING, oldPos, pos, literal.toString());
            return;
          }
          break;
        default:
          literal.append(c);
          break;
      }
    }
    error("unterminated string literal at eof", oldPos, pos);
    addToken(TokenKind.STRING, oldPos, pos, literal.toString());
  }

  /**
   * Scans a string literal delimited by 'quot'.
   *
   * <ul>
   *   <li>ON ENTRY: 'pos' is 1 + the index of the first delimiter
   *   <li>ON EXIT: 'pos' is 1 + the index of the last delimiter.
   * </ul>
   *
   * @param isRaw if true, do not escape the string.
   */
  private void addStringLiteral(char quot, boolean isRaw) {
    int oldPos = isRaw ? pos - 2 : pos - 1;
    int start = pos;
    // Don't even attempt to parse triple-quotes here.
    if (skipTripleQuote(quot)) {
      pos -= 2;
      escapedStringLiteral(quot, isRaw);
      return;
    }
    // first quick optimistic scan for a simple non-escaped string
    while (pos < buffer.length) {
      char c = buffer[pos++];
      switch (c) {
        case '\n':
          error("unterminated string literal at eol", oldPos, pos);
          addToken(TokenKind.STRING, oldPos, pos - 1, bufferSlice(start, pos - 1));
          newline();
          return;
        case '\\':
          if (isRaw) {
            // skip the next character
            pos++;
            break;
          }
          // oops, hit an escape, need to start over & build a new string buffer
          pos = oldPos + 1;
          escapedStringLiteral(quot, false);
          return;
        case '\'':
        case '"':
          if (c == quot) {
            // close-quote, all done.
            addToken(TokenKind.STRING, oldPos, pos, bufferSlice(start, pos - 1));
            return;
          }
      }
    }
    error("unterminated string literal at eof", oldPos, pos);
    addToken(TokenKind.STRING, oldPos, pos, bufferSlice(start, pos));
  }

  private static final ImmutableMap<String, TokenKind> KEYWORD_MAP = createKeywordMap();

  private static ImmutableMap<String, TokenKind> createKeywordMap() {
    ImmutableMap.Builder<String, TokenKind> builder = ImmutableMap.builder();
    for (TokenKind kind : TokenKind.KEYWORDS) {
      builder.put(kind.toString(), kind);
    }
    return builder.build();
  }

  private TokenKind getTokenKindForIdentfier(String id) {
    TokenKind kind = KEYWORD_MAP.get(id);
    return kind == null ? TokenKind.IDENTIFIER : kind;
  }

  private String scanIdentifier() {
    int oldPos = pos - 1;
    while (pos < buffer.length) {
      switch (buffer[pos]) {
        case '_':
        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
        case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
        case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
        case 'v': case 'w': case 'x': case 'y': case 'z':
        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
        case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
        case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
        case 'V': case 'W': case 'X': case 'Y': case 'Z':
        case '0': case '1': case '2': case '3': case '4':
        case '5': case '6': case '7': case '8': case '9':
          pos++;
          break;
        default:
          return bufferSlice(oldPos, pos);
      }
    }
    return bufferSlice(oldPos, pos);
  }

  /**
   * Scans an identifier or keyword.
   *
   * <p>ON ENTRY: 'pos' is 1 + the index of the first char in the identifier. ON EXIT: 'pos' is
   * 1 + the index of the last char in the identifier.
   */
  private void addIdentifierOrKeyword() {
    int oldPos = pos - 1;
    String id = scanIdentifier();
    TokenKind kind = getTokenKindForIdentfier(id);
    addToken(kind, oldPos, pos, (kind == TokenKind.IDENTIFIER) ? id : null);
  }

  private String scanInteger() {
    int oldPos = pos - 1;
    while (pos < buffer.length) {
      char c = buffer[pos];
      switch (c) {
        case 'X': case 'x':
        case 'a': case 'A':
        case 'b': case 'B':
        case 'c': case 'C':
        case 'd': case 'D':
        case 'e': case 'E':
        case 'f': case 'F':
        case '0': case '1': case '2': case '3': case '4':
        case '5': case '6': case '7': case '8': case '9':
          pos++;
          break;
        default:
          return bufferSlice(oldPos, pos);
      }
    }
    return bufferSlice(oldPos, pos);
  }

  /**
   * Scans an integer literal.
   *
   * <p>ON ENTRY: 'pos' is 1 + the index of the first char in the literal. ON EXIT: 'pos' is 1 +
   * the index of the last char in the literal.
   */
  private void addInteger() {
    int oldPos = pos - 1;
    String literal = scanInteger();

    final String substring;
    final int radix;
    if (literal.startsWith("0x") || literal.startsWith("0X")) {
      radix = 16;
      substring = literal.substring(2);
    } else if (literal.startsWith("0") && literal.length() > 1) {
      radix = 8;
      substring = literal.substring(1);
    } else {
      radix = 10;
      substring = literal;
    }
    int value = 0;
    try {
      value = Integer.parseInt(substring, radix);
    } catch (NumberFormatException e) {
      error("invalid base-" + radix + " integer constant: " + literal);
    }
    addToken(TokenKind.INT, oldPos, pos, value);
  }

  /**
   * Tokenizes a two-char operator.
   *
   * @return true if it tokenized an operator
   */
  private boolean tokenizeTwoChars() {
    if (pos + 2 >= buffer.length) {
      return false;
    }
    char c1 = buffer[pos];
    char c2 = buffer[pos + 1];
    TokenKind tok = null;
    if (c2 == '=') {
      tok = EQUAL_TOKENS.get(c1);
    } else if (c2 == '*' && c1 == '*') {
      tok = TokenKind.STAR_STAR;
    }
    if (tok == null) {
      return false;
    }
    addToken(tok, pos, pos + 2);
    return true;
  }

  /**
   * Performs tokenization of the character buffer of file contents provided to the constructor.
   */
  private void tokenize() {
    while (pos < buffer.length) {
      if (tokenizeTwoChars()) {
        pos += 2;
        continue;
      }
      char c = buffer[pos];
      pos++;
      switch (c) {
        case '{': {
          addToken(TokenKind.LBRACE, pos - 1, pos);
          openParenStackDepth++;
          break;
        }
        case '}': {
          addToken(TokenKind.RBRACE, pos - 1, pos);
          popParen();
          break;
        }
        case '(': {
          addToken(TokenKind.LPAREN, pos - 1, pos);
          openParenStackDepth++;
          break;
        }
        case ')': {
          addToken(TokenKind.RPAREN, pos - 1, pos);
          popParen();
          break;
        }
        case '[': {
          addToken(TokenKind.LBRACKET, pos - 1, pos);
          openParenStackDepth++;
          break;
        }
        case ']': {
          addToken(TokenKind.RBRACKET, pos - 1, pos);
          popParen();
          break;
        }
        case '>': {
          addToken(TokenKind.GREATER, pos - 1, pos);
          break;
        }
        case '<': {
          addToken(TokenKind.LESS, pos - 1, pos);
          break;
        }
        case ':': {
          addToken(TokenKind.COLON, pos - 1, pos);
          break;
        }
        case ',': {
          addToken(TokenKind.COMMA, pos - 1, pos);
          break;
        }
        case '+': {
          addToken(TokenKind.PLUS, pos - 1, pos);
          break;
        }
        case '-': {
          addToken(TokenKind.MINUS, pos - 1, pos);
          break;
        }
        case '|': {
          addToken(TokenKind.PIPE, pos - 1, pos);
          break;
        }
        case '=': {
          addToken(TokenKind.EQUALS, pos - 1, pos);
          break;
        }
        case '%': {
          addToken(TokenKind.PERCENT, pos - 1, pos);
          break;
        }
        case '/': {
          addToken(TokenKind.SLASH, pos - 1, pos);
          break;
        }
        case ';': {
          addToken(TokenKind.SEMI, pos - 1, pos);
          break;
        }
        case '.': {
          addToken(TokenKind.DOT, pos - 1, pos);
          break;
        }
        case '*': {
          addToken(TokenKind.STAR, pos - 1, pos);
          break;
        }
        case ' ':
        case '\t':
        case '\r': {
          addWhitespace();
          break;
        }
        case '\\': {
          // Backslash character is valid only at the end of a line (or in a string)
          if (pos + 1 < buffer.length && buffer[pos] == '\n') {
            // treat end of line backslash and newline char as whitespace
            // (they're ignored by the parser)
            pos++;
            addToken(TokenKind.WHITESPACE, pos - 2, pos, Character.toString(c));
          } else {
            addToken(TokenKind.ILLEGAL, pos - 1, pos, Character.toString(c));
          }
          break;
        }
        case '\n': {
          newline();
          break;
        }
        case '#': {
          int oldPos = pos - 1;
          while (pos < buffer.length) {
            c = buffer[pos];
            if (c == '\n') {
              break;
            } else {
              pos++;
            }
          }
          addToken(TokenKind.COMMENT, oldPos, pos, bufferSlice(oldPos, pos));
          break;
        }
        case '\'':
        case '\"': {
          addStringLiteral(c, false);
          break;
        }
        default: {
          // detect raw strings, e.g. r"str"
          if (c == 'r' && pos < buffer.length && (buffer[pos] == '\'' || buffer[pos] == '\"')) {
            c = buffer[pos];
            pos++;
            addStringLiteral(c, true);
            break;
          }
          if (Character.isDigit(c)) {
            addInteger();
          } else if (Character.isJavaIdentifierStart(c) && c != '$') {
            addIdentifierOrKeyword();
          } else {
            // Some characters in Python are not recognized in Blaze syntax (e.g. '!')
            addToken(TokenKind.ILLEGAL, pos - 1, pos, Character.toString(c));
            error("invalid character: '" + c + "'");
          }
          break;
        } // default
      } // switch
    } // while
  }

  /**
   * Returns parts of the source buffer based on offsets.
   *
   * @param start the beginning offset for the slice
   * @param end the offset immediately following the slice
   * @return the text at offset start with length end - start
   */
  private String bufferSlice(int start, int end) {
    return new String(this.buffer, start, end - start);
  }
}
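For orientation, here is a minimal sketch of how the class above might be driven. It uses only the entry points shown in this file (the constructor, LexerMode, getTokens(), containsErrors()); the Token and TokenKind classes live in the same package but are not reproduced here, and the BuildLexerDemo wrapper class is a hypothetical example, not part of the plugin. Since Token's API isn't shown, the sketch only inspects tokens via toString().

package com.google.idea.blaze.base.lang.buildfile.lexer;

/** Hypothetical demo class, shown only to illustrate the entry points above. */
public class BuildLexerDemo {
  public static void main(String[] args) {
    // Lex a tiny BUILD-language fragment. Depth 0 means we are not resuming
    // inside an unclosed '(', '[' or '{'; SyntaxHighlighting mode tokenizes
    // every character without emitting INDENT/DEDENT tokens.
    BuildLexerBase lexer =
        new BuildLexerBase("x = [1, 2]\n", 0, BuildLexerBase.LexerMode.SyntaxHighlighting);

    for (Token token : lexer.getTokens()) {
      // Token's fields aren't defined in this file; toString() is used here
      // purely to inspect the output.
      System.out.println(token);
    }
    if (lexer.containsErrors()) {
      System.out.println("input contained lexing errors");
    }
  }
}

In Parsing mode the same call would additionally fold indentation into zero-length INDENT/DEDENT tokens, as described in the newlineOutsideExpression() javadoc.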