From 0c40637b875653dc0a73ef4ebeecfbd8a8e74bc3 Mon Sep 17 00:00:00 2001 From: Steve Dignam Date: Fri, 26 Jun 2026 22:49:11 -0400 Subject: [PATCH] parser/lexer: national strings own type, $$ quoted string err recovery --- crates/squawk_fmt/src/fmt.rs | 1 + crates/squawk_ide/src/column_name.rs | 8 +- crates/squawk_ide/src/hover.rs | 15 ++ crates/squawk_ide/src/infer.rs | 1 + crates/squawk_ide/src/literals.rs | 4 + crates/squawk_ide/src/tokens.rs | 1 + crates/squawk_lexer/src/cursor.rs | 5 + crates/squawk_lexer/src/lib.rs | 112 +++++++++-- .../squawk_lexer__tests__quoted_ident.snap | 4 +- ...tests__quoted_ident_with_escape_quote.snap | 2 +- ...k_lexer__tests__string_unicode_escape.snap | 6 +- crates/squawk_lexer/src/token.rs | 4 +- .../src/generated/syntax_kind.rs | 1 + crates/squawk_parser/src/grammar.rs | 8 +- crates/squawk_parser/src/lexed_str.rs | 31 ++- crates/squawk_parser/src/lib.rs | 66 +++++-- .../tests/data/ok/operator_trailing_signs.sql | 13 ++ .../tests__operator_trailing_signs_ok.snap | 187 ++++++++++++++++++ .../snapshots/tests__select_literal_err.snap | 8 +- crates/squawk_syntax/src/ast/node_ext.rs | 2 + crates/squawk_syntax/src/postgresql.ungram | 1 + ...ax__test__custom_operators_validation.snap | 126 ++++++------ ...test__dollar_quoted_string_validation.snap | 20 +- crates/squawk_syntax/src/validation.rs | 50 +---- .../test_data/validation/custom_operators.sql | 2 +- 25 files changed, 510 insertions(+), 168 deletions(-) create mode 100644 crates/squawk_parser/tests/data/ok/operator_trailing_signs.sql create mode 100644 crates/squawk_parser/tests/snapshots/tests__operator_trailing_signs_ok.snap diff --git a/crates/squawk_fmt/src/fmt.rs b/crates/squawk_fmt/src/fmt.rs index deba6fc71..6ea5de6f4 100644 --- a/crates/squawk_fmt/src/fmt.rs +++ b/crates/squawk_fmt/src/fmt.rs @@ -492,6 +492,7 @@ fn build_literal<'a>(lit: ast::Literal) -> Doc<'a> { | LitKind::ByteString(_) | LitKind::DollarQuotedString(_) | LitKind::EscString(_) + | LitKind::NationalString(_) | LitKind::String(_) | LitKind::UnicodeEscString(_) => build_string_literal(&lit), } diff --git a/crates/squawk_ide/src/column_name.rs b/crates/squawk_ide/src/column_name.rs index 04eed9b80..2fde85bd2 100644 --- a/crates/squawk_ide/src/column_name.rs +++ b/crates/squawk_ide/src/column_name.rs @@ -486,9 +486,11 @@ fn name_from_expr(expr: ast::Expr, in_type: bool) -> Option<(ColumnName, SyntaxN } } ast::Expr::Literal(literal) => { - if literal.syntax().first_token().is_some_and(|token| { - token.kind() == SyntaxKind::STRING && token.text().starts_with(['n', 'N']) - }) { + if literal + .syntax() + .first_token() + .is_some_and(|token| token.kind() == SyntaxKind::NATIONAL_STRING) + { return Some((ColumnName::UnknownColumn(Some("bpchar".to_string())), node)); } return Some((ColumnName::UnknownColumn(None), node)); diff --git a/crates/squawk_ide/src/hover.rs b/crates/squawk_ide/src/hover.rs index a3286d668..db56af121 100644 --- a/crates/squawk_ide/src/hover.rs +++ b/crates/squawk_ide/src/hover.rs @@ -156,6 +156,7 @@ fn hover_literal(literal: &ast::Literal) -> Option { | LitKind::BitString(_) | LitKind::ByteString(_) | LitKind::EscString(_) + | LitKind::NationalString(_) | LitKind::UnicodeEscString(_) | LitKind::DollarQuotedString(_) ) { @@ -170,6 +171,7 @@ fn hover_literal(literal: &ast::Literal) -> Option { LitKind::ByteString(_) => format_bit_value_comment(&value, 16), LitKind::String(_) | LitKind::EscString(_) + | LitKind::NationalString(_) | LitKind::UnicodeEscString(_) | LitKind::DollarQuotedString(_) => match value.find('\n') { Some(idx) => { @@ -5604,6 +5606,19 @@ select 'foo$0'; "); } + #[test] + fn hover_national_string() { + assert_snapshot!(check_hover_info(r" +select N'fo$0o'; +").markdown(), @" + ```sql + text + ``` + --- + value of literal: ` foo ` + "); + } + #[test] fn hover_plain_string_escaped_quotes() { assert_snapshot!(check_hover_info(r" diff --git a/crates/squawk_ide/src/infer.rs b/crates/squawk_ide/src/infer.rs index 2381305a9..e9915a9c1 100644 --- a/crates/squawk_ide/src/infer.rs +++ b/crates/squawk_ide/src/infer.rs @@ -95,6 +95,7 @@ pub(crate) fn infer_type_from_literal(literal: &ast::Literal) -> Option { SyntaxKind::STRING | SyntaxKind::DOLLAR_QUOTED_STRING | SyntaxKind::ESC_STRING + | SyntaxKind::NATIONAL_STRING | SyntaxKind::UNICODE_ESC_STRING => Some(Type::Text), SyntaxKind::BIT_STRING | SyntaxKind::BYTE_STRING => Some(Type::Bit), SyntaxKind::TRUE_KW | SyntaxKind::FALSE_KW => Some(Type::Boolean), diff --git a/crates/squawk_ide/src/literals.rs b/crates/squawk_ide/src/literals.rs index 034a147d5..703430313 100644 --- a/crates/squawk_ide/src/literals.rs +++ b/crates/squawk_ide/src/literals.rs @@ -94,6 +94,10 @@ pub(crate) fn literal_string_value(literal: &ast::Literal) -> Option { out.push_str(inner); return Some(out); } + SyntaxKind::NATIONAL_STRING => { + let inner = strip_prefixed_quotes(token.text(), ['n', 'N'])?; + decode_plain_string(inner, &mut out); + } SyntaxKind::STRING => { let inner = strip_quotes(token.text())?; match decoding { diff --git a/crates/squawk_ide/src/tokens.rs b/crates/squawk_ide/src/tokens.rs index b50667860..7d95b589e 100644 --- a/crates/squawk_ide/src/tokens.rs +++ b/crates/squawk_ide/src/tokens.rs @@ -10,5 +10,6 @@ pub(crate) fn is_string_or_comment(kind: SyntaxKind) -> bool { | SyntaxKind::BIT_STRING | SyntaxKind::DOLLAR_QUOTED_STRING | SyntaxKind::ESC_STRING + | SyntaxKind::NATIONAL_STRING ) } diff --git a/crates/squawk_lexer/src/cursor.rs b/crates/squawk_lexer/src/cursor.rs index d3d1a8571..1d433bb07 100644 --- a/crates/squawk_lexer/src/cursor.rs +++ b/crates/squawk_lexer/src/cursor.rs @@ -43,6 +43,11 @@ impl<'a> Cursor<'a> { self.chars.as_str().is_empty() } + /// Returns a clone of the remaining chars, for cheap lookahead. + pub(crate) fn chars(&self) -> Chars<'a> { + self.chars.clone() + } + /// Returns amount of already consumed symbols. pub(crate) fn pos_within_token(&self) -> u32 { (self.len_remaining - self.chars.as_str().len()) as u32 diff --git a/crates/squawk_lexer/src/lib.rs b/crates/squawk_lexer/src/lib.rs index e4fc7466d..a02538ecf 100644 --- a/crates/squawk_lexer/src/lib.rs +++ b/crates/squawk_lexer/src/lib.rs @@ -87,7 +87,7 @@ impl Cursor<'_> { self.bump(); let terminated = self.single_quoted_string(false); TokenKind::Literal { - kind: LiteralKind::Str { terminated }, + kind: LiteralKind::NationalStr { terminated }, } } _ => self.ident(), @@ -125,8 +125,7 @@ impl Cursor<'_> { '?' => TokenKind::Question, ':' => TokenKind::Colon, '$' => { - // Dollar quoted strings - if is_ident_start(self.first()) || self.first() == '$' { + if self.is_dollar_quote_start() { self.dollar_quoted_string() } else { // Parameters @@ -162,7 +161,10 @@ impl Cursor<'_> { // Quoted indentifiers '"' => { let terminated = self.double_quoted_string(); - TokenKind::QuotedIdent { terminated } + TokenKind::QuotedIdent { + terminated, + uescape: false, + } } _ => TokenKind::Unknown, }; @@ -235,7 +237,10 @@ impl Cursor<'_> { '"' if allows_double => { self.bump(); let terminated = self.double_quoted_string(); - TokenKind::QuotedIdent { terminated } + TokenKind::QuotedIdent { + terminated, + uescape: true, + } } _ => self.ident(), } @@ -295,7 +300,10 @@ impl Cursor<'_> { }; match self.first() { - '.' => self.eat_fractional(), + '.' => { + self.bump(); + self.eat_fractional() + } 'e' | 'E' => { let exponent_start = self.pos_within_token(); self.bump(); @@ -373,6 +381,28 @@ impl Cursor<'_> { false } + /// Check for `$$` and `$tag$` + fn is_dollar_quote_start(&self) -> bool { + let mut chars = self.chars(); + match chars.next() { + // `$$...` -- empty tag + Some('$') => true, + // `$tag$...` -- tag chars terminated by `$` + Some(c) if is_ident_start(c) => { + for c in chars { + if c == '$' { + return true; + } + if !is_ident_cont(c) { + return false; + } + } + false + } + _ => false, + } + } + // https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-DOLLAR-QUOTING fn dollar_quoted_string(&mut self) -> TokenKind { // Get the start sequence of the dollar quote, i.e., 'foo' in @@ -487,11 +517,13 @@ impl Cursor<'_> { /// Eats the numeric exponent. Returns true if at least one digit was met, /// and returns false otherwise. fn eat_numeric_exponent(&mut self) -> bool { - if self.first() == '_' { - return false; - } if self.first() == '-' || self.first() == '+' { + if !self.second().is_ascii_digit() { + return false; + } self.bump(); + } else if !self.first().is_ascii_digit() { + return false; } self.eat_decimal_digits() } @@ -503,9 +535,6 @@ impl Cursor<'_> { } pub(crate) fn eat_fractional(&mut self) -> crate::LiteralKind { - // might have stuff after the ., and if it does, it needs to start - // with a number - self.bump(); let mut empty_exponent_start = None; if self.first().is_ascii_digit() { self.eat_decimal_digits(); @@ -715,6 +744,42 @@ $foo$hello$world$bar$ "#)) } + #[test] + fn numeric_leading_dot_with_separators() { + assert_debug_snapshot!(lex(".1_2 .5_5 .1_2e3"), @r#" + [ + ".1_2" @ Literal { kind: Numeric { empty_exponent_start: None, trailing_junk_start: 4 } }, + " " @ Whitespace, + ".5_5" @ Literal { kind: Numeric { empty_exponent_start: None, trailing_junk_start: 4 } }, + " " @ Whitespace, + ".1_2e3" @ Literal { kind: Numeric { empty_exponent_start: None, trailing_junk_start: 6 } }, + ] + "#) + } + + #[test] + fn numeric_exponent_underscore_after_sign() { + assert_debug_snapshot!(lex("1e+_2 1e-_2 1.0e+_2 .1e+_2"), @r#" + [ + "1e" @ Literal { kind: Numeric { empty_exponent_start: Some(1), trailing_junk_start: 2 } }, + "+" @ Plus, + "_2" @ Ident, + " " @ Whitespace, + "1e" @ Literal { kind: Numeric { empty_exponent_start: Some(1), trailing_junk_start: 2 } }, + "-" @ Minus, + "_2" @ Ident, + " " @ Whitespace, + "1.0e" @ Literal { kind: Numeric { empty_exponent_start: Some(3), trailing_junk_start: 4 } }, + "+" @ Plus, + "_2" @ Ident, + " " @ Whitespace, + ".1e" @ Literal { kind: Numeric { empty_exponent_start: Some(2), trailing_junk_start: 3 } }, + "+" @ Plus, + "_2" @ Ident, + ] + "#) + } + #[test] fn select_with_period() { assert_debug_snapshot!(lex(r#" @@ -736,9 +801,9 @@ x'1FF' fn national_character_string() { assert_debug_snapshot!(lex("N'foo' n'bar' numeric'1'"), @r#" [ - "N'foo'" @ Literal { kind: Str { terminated: true } }, + "N'foo'" @ Literal { kind: NationalStr { terminated: true } }, " " @ Whitespace, - "n'bar'" @ Literal { kind: Str { terminated: true } }, + "n'bar'" @ Literal { kind: NationalStr { terminated: true } }, " " @ Whitespace, "numeric" @ Ident, "'1'" @ Literal { kind: Str { terminated: true } }, @@ -902,6 +967,25 @@ U&"d!0061t!+000061" UESCAPE '!' "#); } + #[test] + fn unclosed_dollar_tag_does_not_swallow_rest_of_input() { + assert_debug_snapshot!(lex("select $x;\ndrop table users;"), @r#" + [ + "select" @ Ident, + " " @ Whitespace, + "$x" @ PositionalParam { trailing_junk_start: 1 }, + ";" @ Semi, + "\n" @ Whitespace, + "drop" @ Ident, + " " @ Whitespace, + "table" @ Ident, + " " @ Whitespace, + "users" @ Ident, + ";" @ Semi, + ] + "#); + } + #[test] fn ident_non_ascii_above_latin1() { assert_debug_snapshot!(lex("ẞ Δ€ ζΌ’ε­— 𐐷"), @r#" diff --git a/crates/squawk_lexer/src/snapshots/squawk_lexer__tests__quoted_ident.snap b/crates/squawk_lexer/src/snapshots/squawk_lexer__tests__quoted_ident.snap index 70f713420..8b683bf1e 100644 --- a/crates/squawk_lexer/src/snapshots/squawk_lexer__tests__quoted_ident.snap +++ b/crates/squawk_lexer/src/snapshots/squawk_lexer__tests__quoted_ident.snap @@ -4,8 +4,8 @@ expression: "lex(r#\"\n\"hello &1 -world\";\n\n\n\"hello-world\n\"#)" --- [ "\n" @ Whitespace, - "\"hello &1 -world\"" @ QuotedIdent { terminated: true }, + "\"hello &1 -world\"" @ QuotedIdent { terminated: true, uescape: false }, ";" @ Semi, "\n\n\n" @ Whitespace, - "\"hello-world\n" @ QuotedIdent { terminated: false }, + "\"hello-world\n" @ QuotedIdent { terminated: false, uescape: false }, ] diff --git a/crates/squawk_lexer/src/snapshots/squawk_lexer__tests__quoted_ident_with_escape_quote.snap b/crates/squawk_lexer/src/snapshots/squawk_lexer__tests__quoted_ident_with_escape_quote.snap index c5dc32d1c..e1ba85dd4 100644 --- a/crates/squawk_lexer/src/snapshots/squawk_lexer__tests__quoted_ident_with_escape_quote.snap +++ b/crates/squawk_lexer/src/snapshots/squawk_lexer__tests__quoted_ident_with_escape_quote.snap @@ -4,6 +4,6 @@ expression: "lex(r#\"\n\"foo \"\" bar\"\n\"#)" --- [ "\n" @ Whitespace, - "\"foo \"\" bar\"" @ QuotedIdent { terminated: true }, + "\"foo \"\" bar\"" @ QuotedIdent { terminated: true, uescape: false }, "\n" @ Whitespace, ] diff --git a/crates/squawk_lexer/src/snapshots/squawk_lexer__tests__string_unicode_escape.snap b/crates/squawk_lexer/src/snapshots/squawk_lexer__tests__string_unicode_escape.snap index dec48147a..b6a99d1b2 100644 --- a/crates/squawk_lexer/src/snapshots/squawk_lexer__tests__string_unicode_escape.snap +++ b/crates/squawk_lexer/src/snapshots/squawk_lexer__tests__string_unicode_escape.snap @@ -4,13 +4,13 @@ expression: "lex(r#\"\nU&\"d\\0061t\\+000061\"\n\nU&\"\\0441\\043B\\043E\\043D\" --- [ "\n" @ Whitespace, - "U&\"d\\0061t\\+000061\"" @ QuotedIdent { terminated: true }, + "U&\"d\\0061t\\+000061\"" @ QuotedIdent { terminated: true, uescape: true }, "\n\n" @ Whitespace, - "U&\"\\0441\\043B\\043E\\043D\"" @ QuotedIdent { terminated: true }, + "U&\"\\0441\\043B\\043E\\043D\"" @ QuotedIdent { terminated: true, uescape: true }, "\n\n" @ Whitespace, "u&'\\0441\\043B'" @ Literal { kind: UnicodeEscStr { terminated: true } }, "\n\n" @ Whitespace, - "U&\"d!0061t!+000061\"" @ QuotedIdent { terminated: true }, + "U&\"d!0061t!+000061\"" @ QuotedIdent { terminated: true, uescape: true }, " " @ Whitespace, "UESCAPE" @ Ident, " " @ Whitespace, diff --git a/crates/squawk_lexer/src/token.rs b/crates/squawk_lexer/src/token.rs index d4873a584..c058f136d 100644 --- a/crates/squawk_lexer/src/token.rs +++ b/crates/squawk_lexer/src/token.rs @@ -88,7 +88,7 @@ pub enum TokenKind { /// These are case-sensitive, unlike [`TokenKind::Ident`] /// /// see: - QuotedIdent { terminated: bool }, + QuotedIdent { terminated: bool, uescape: bool }, } /// Parsed token. @@ -143,6 +143,8 @@ pub enum LiteralKind { /// /// see: Str { terminated: bool }, + /// National character string, e.g., `N'foo'` + NationalStr { terminated: bool }, /// Hexidecimal Bit String, e.g., `X'1FF'` /// /// see: diff --git a/crates/squawk_parser/src/generated/syntax_kind.rs b/crates/squawk_parser/src/generated/syntax_kind.rs index dd1a9ad18..0c71c190a 100644 --- a/crates/squawk_parser/src/generated/syntax_kind.rs +++ b/crates/squawk_parser/src/generated/syntax_kind.rs @@ -555,6 +555,7 @@ pub enum SyntaxKind { DOLLAR_QUOTED_STRING, ESC_STRING, INT_NUMBER, + NATIONAL_STRING, NULL, NUMERIC_NUMBER, POSITIONAL_PARAM, diff --git a/crates/squawk_parser/src/grammar.rs b/crates/squawk_parser/src/grammar.rs index 809acf61f..beb8d6f12 100644 --- a/crates/squawk_parser/src/grammar.rs +++ b/crates/squawk_parser/src/grammar.rs @@ -43,7 +43,12 @@ fn literal(p: &mut Parser<'_>) -> Option { if p.eat(UESCAPE_KW) { p.expect(STRING); } - } else if p.eat(STRING) || p.eat(ESC_STRING) || p.eat(BIT_STRING) || p.eat(BYTE_STRING) { + } else if p.eat(NATIONAL_STRING) + || p.eat(STRING) + || p.eat(ESC_STRING) + || p.eat(BIT_STRING) + || p.eat(BYTE_STRING) + { while !p.at(EOF) && p.eat(STRING) {} } else { p.bump_any(); @@ -4882,6 +4887,7 @@ const STRING_FIRST: TokenSet = TokenSet::new(&[ BIT_STRING, DOLLAR_QUOTED_STRING, ESC_STRING, + NATIONAL_STRING, ]); // via https://www.postgresql.org/docs/17/sql-createoperator.html diff --git a/crates/squawk_parser/src/lexed_str.rs b/crates/squawk_parser/src/lexed_str.rs index 33e1fb806..27158c9f6 100644 --- a/crates/squawk_parser/src/lexed_str.rs +++ b/crates/squawk_parser/src/lexed_str.rs @@ -120,16 +120,15 @@ struct Converter<'a> { offset: usize, } -fn is_empty_quoted_ident(token_text: &str) -> bool { - let inner = if let Some(stripped) = token_text - .strip_prefix(['u', 'U']) - .and_then(|s| s.strip_prefix('&')) - { - stripped - } else { +fn is_empty_quoted_ident(token_text: &str, uescape: bool) -> bool { + let inner = if uescape { token_text + .strip_prefix(['u', 'U']) + .and_then(|s| s.strip_prefix('&')) + } else { + Some(token_text) }; - inner == "\"\"" + inner == Some("\"\"") } impl<'a> Converter<'a> { @@ -237,10 +236,13 @@ impl<'a> Converter<'a> { } SyntaxKind::POSITIONAL_PARAM } - squawk_lexer::TokenKind::QuotedIdent { terminated } => { + squawk_lexer::TokenKind::QuotedIdent { + terminated, + uescape, + } => { if !terminated { err = "Missing trailing \" to terminate the quoted identifier" - } else if is_empty_quoted_ident(token_text) { + } else if is_empty_quoted_ident(token_text, *uescape) { err = "empty delimited identifier"; } SyntaxKind::IDENT @@ -309,6 +311,15 @@ impl<'a> Converter<'a> { } SyntaxKind::STRING } + squawk_lexer::LiteralKind::NationalStr { terminated } => { + if !terminated { + err = Some( + "Missing trailing `'` symbol to terminate the national character string literal" + .into(), + ); + } + SyntaxKind::NATIONAL_STRING + } squawk_lexer::LiteralKind::ByteStr { terminated } => { if !terminated { err = Some( diff --git a/crates/squawk_parser/src/lib.rs b/crates/squawk_parser/src/lib.rs index 1e57d3f3f..ec2bcf520 100644 --- a/crates/squawk_parser/src/lib.rs +++ b/crates/squawk_parser/src/lib.rs @@ -173,6 +173,29 @@ enum TrivaBetween { Allowed, } +const OPERATOR_SIGN: TokenSet = TokenSet::new(&[SyntaxKind::PLUS, SyntaxKind::MINUS]); + +/// In order for an operator to end in `+` or `-`, it must contain one of the +/// following chars: +/// +/// ```sql +/// ~ ! @ # % ^ & | ` ? +/// ``` +/// +/// see: +const SPECIAL_OP_CHARS: TokenSet = TokenSet::new(&[ + SyntaxKind::TILDE, + SyntaxKind::BANG, + SyntaxKind::AT, + SyntaxKind::POUND, + SyntaxKind::PERCENT, + SyntaxKind::CARET, + SyntaxKind::AMP, + SyntaxKind::PIPE, + SyntaxKind::BACKTICK, + SyntaxKind::QUESTION, +]); + impl<'t> Parser<'t> { fn new(inp: &'t Input) -> Parser<'t> { Parser { @@ -437,16 +460,8 @@ impl<'t> Parser<'t> { } SyntaxKind::CUSTOM_OP => { let m = self.start(); - while !self.at(SyntaxKind::EOF) { - let is_joint = self.inp.is_joint(self.pos); - if self.at_ts(OPERATOR_FIRST) { - self.bump_any(); - } else { - break; - } - if !is_joint { - break; - } + for _ in 0..self.op_len() { + self.bump_any(); } m.complete(self, SyntaxKind::CUSTOM_OP); return true; @@ -493,16 +508,39 @@ impl<'t> Parser<'t> { } fn next_not_joined_op(&self, n: usize) -> bool { - let next = self.inp.kind(self.pos + n + 1); // next isn't an operator so we know we're not joined to it - if !OPERATOR_FIRST.contains(next) { + if !self.nth_at_ts(n + 1, OPERATOR_FIRST) { return true; } // current kind isn't joined if !self.inp.is_joint(self.pos + n) { return true; } - false + self.op_len() == n + 1 + } + + fn op_len(&self) -> usize { + if !self.at_ts(OPERATOR_FIRST) { + return 0; + } + + let mut len = 1; + let mut has_special = self.at_ts(SPECIAL_OP_CHARS); + while self.inp.is_joint(self.pos + len - 1) && self.nth_at_ts(len, OPERATOR_FIRST) { + has_special |= self.nth_at_ts(len, SPECIAL_OP_CHARS); + len += 1; + } + + // PostgreSQL skips trailing signs from ops if they don't contain a + // special char. + // This means `2*-3` parses as `2 * -3`. + if !has_special { + while len > 1 && self.nth_at_ts(len - 1, OPERATOR_SIGN) { + len -= 1; + } + } + + len } /// Checks if the current token is in `kinds`. @@ -870,8 +908,6 @@ impl<'t> Parser<'t> { /// token. #[must_use] fn nth(&self, n: usize) -> SyntaxKind { - assert!(n <= 3); - let steps = self.steps.get(); assert!( (steps as usize) < PARSER_STEP_LIMIT, diff --git a/crates/squawk_parser/tests/data/ok/operator_trailing_signs.sql b/crates/squawk_parser/tests/data/ok/operator_trailing_signs.sql new file mode 100644 index 000000000..b5e12ac34 --- /dev/null +++ b/crates/squawk_parser/tests/data/ok/operator_trailing_signs.sql @@ -0,0 +1,13 @@ +-- PostgreSQL splits trailing + or - from multi-character operators unless +-- the operator contains one of: ~ ! @ # % ^ & | ` ? +select 2*-3; +select 2/-3; +select 2+-3; +select 2<=-3; +select 2=-3; +select 2*+3; +select 2<=+3; +select 2++3; +select 2@-3; +select 2<@-3; +select 2!=-3; diff --git a/crates/squawk_parser/tests/snapshots/tests__operator_trailing_signs_ok.snap b/crates/squawk_parser/tests/snapshots/tests__operator_trailing_signs_ok.snap new file mode 100644 index 000000000..320c2ce16 --- /dev/null +++ b/crates/squawk_parser/tests/snapshots/tests__operator_trailing_signs_ok.snap @@ -0,0 +1,187 @@ +--- +source: crates/squawk_parser/tests/tests.rs +input_file: crates/squawk_parser/tests/data/ok/operator_trailing_signs.sql +--- +SOURCE_FILE + COMMENT "-- PostgreSQL splits trailing + or - from multi-character operators unless" + WHITESPACE "\n" + COMMENT "-- the operator contains one of: ~ ! @ # % ^ & | ` ?" + WHITESPACE "\n" + SELECT + SELECT_CLAUSE + SELECT_KW "select" + WHITESPACE " " + TARGET_LIST + TARGET + BIN_EXPR + LITERAL + INT_NUMBER "2" + STAR "*" + PREFIX_EXPR + MINUS "-" + LITERAL + INT_NUMBER "3" + SEMICOLON ";" + WHITESPACE "\n" + SELECT + SELECT_CLAUSE + SELECT_KW "select" + WHITESPACE " " + TARGET_LIST + TARGET + BIN_EXPR + LITERAL + INT_NUMBER "2" + SLASH "/" + PREFIX_EXPR + MINUS "-" + LITERAL + INT_NUMBER "3" + SEMICOLON ";" + WHITESPACE "\n" + SELECT + SELECT_CLAUSE + SELECT_KW "select" + WHITESPACE " " + TARGET_LIST + TARGET + BIN_EXPR + LITERAL + INT_NUMBER "2" + PLUS "+" + PREFIX_EXPR + MINUS "-" + LITERAL + INT_NUMBER "3" + SEMICOLON ";" + WHITESPACE "\n" + SELECT + SELECT_CLAUSE + SELECT_KW "select" + WHITESPACE " " + TARGET_LIST + TARGET + BIN_EXPR + LITERAL + INT_NUMBER "2" + LTEQ "<=" + PREFIX_EXPR + MINUS "-" + LITERAL + INT_NUMBER "3" + SEMICOLON ";" + WHITESPACE "\n" + SELECT + SELECT_CLAUSE + SELECT_KW "select" + WHITESPACE " " + TARGET_LIST + TARGET + BIN_EXPR + LITERAL + INT_NUMBER "2" + EQ "=" + PREFIX_EXPR + MINUS "-" + LITERAL + INT_NUMBER "3" + SEMICOLON ";" + WHITESPACE "\n" + SELECT + SELECT_CLAUSE + SELECT_KW "select" + WHITESPACE " " + TARGET_LIST + TARGET + BIN_EXPR + LITERAL + INT_NUMBER "2" + STAR "*" + PREFIX_EXPR + PLUS "+" + LITERAL + INT_NUMBER "3" + SEMICOLON ";" + WHITESPACE "\n" + SELECT + SELECT_CLAUSE + SELECT_KW "select" + WHITESPACE " " + TARGET_LIST + TARGET + BIN_EXPR + LITERAL + INT_NUMBER "2" + LTEQ "<=" + PREFIX_EXPR + PLUS "+" + LITERAL + INT_NUMBER "3" + SEMICOLON ";" + WHITESPACE "\n" + SELECT + SELECT_CLAUSE + SELECT_KW "select" + WHITESPACE " " + TARGET_LIST + TARGET + BIN_EXPR + LITERAL + INT_NUMBER "2" + PLUS "+" + PREFIX_EXPR + PLUS "+" + LITERAL + INT_NUMBER "3" + SEMICOLON ";" + WHITESPACE "\n" + SELECT + SELECT_CLAUSE + SELECT_KW "select" + WHITESPACE " " + TARGET_LIST + TARGET + BIN_EXPR + LITERAL + INT_NUMBER "2" + CUSTOM_OP + AT "@" + MINUS "-" + LITERAL + INT_NUMBER "3" + SEMICOLON ";" + WHITESPACE "\n" + SELECT + SELECT_CLAUSE + SELECT_KW "select" + WHITESPACE " " + TARGET_LIST + TARGET + BIN_EXPR + LITERAL + INT_NUMBER "2" + CUSTOM_OP + L_ANGLE "<" + AT "@" + MINUS "-" + LITERAL + INT_NUMBER "3" + SEMICOLON ";" + WHITESPACE "\n" + SELECT + SELECT_CLAUSE + SELECT_KW "select" + WHITESPACE " " + TARGET_LIST + TARGET + BIN_EXPR + LITERAL + INT_NUMBER "2" + CUSTOM_OP + BANG "!" + EQ "=" + MINUS "-" + LITERAL + INT_NUMBER "3" + SEMICOLON ";" + WHITESPACE "\n" diff --git a/crates/squawk_parser/tests/snapshots/tests__select_literal_err.snap b/crates/squawk_parser/tests/snapshots/tests__select_literal_err.snap index 690c40aa7..72080fd39 100644 --- a/crates/squawk_parser/tests/snapshots/tests__select_literal_err.snap +++ b/crates/squawk_parser/tests/snapshots/tests__select_literal_err.snap @@ -79,8 +79,12 @@ SOURCE_FILE WHITESPACE " " TARGET_LIST TARGET - LITERAL - NUMERIC_NUMBER "0.0e+a" + BIN_EXPR + LITERAL + NUMERIC_NUMBER "0.0e" + PLUS "+" + NAME_REF + IDENT "a" SEMICOLON ";" WHITESPACE "\n" SELECT diff --git a/crates/squawk_syntax/src/ast/node_ext.rs b/crates/squawk_syntax/src/ast/node_ext.rs index 0487d7d8f..5af2b6b1b 100644 --- a/crates/squawk_syntax/src/ast/node_ext.rs +++ b/crates/squawk_syntax/src/ast/node_ext.rs @@ -50,6 +50,7 @@ pub enum LitKind { EscString(SyntaxToken), False(SyntaxToken), IntNumber(SyntaxToken), + NationalString(SyntaxToken), Null(SyntaxToken), NumericNumber(SyntaxToken), PositionalParam(SyntaxToken), @@ -69,6 +70,7 @@ impl ast::Literal { SyntaxKind::ESC_STRING => LitKind::EscString(token), SyntaxKind::FALSE_KW => LitKind::False(token), SyntaxKind::INT_NUMBER => LitKind::IntNumber(token), + SyntaxKind::NATIONAL_STRING => LitKind::NationalString(token), SyntaxKind::NULL_KW => LitKind::Null(token), SyntaxKind::NUMERIC_NUMBER => LitKind::NumericNumber(token), SyntaxKind::POSITIONAL_PARAM => LitKind::PositionalParam(token), diff --git a/crates/squawk_syntax/src/postgresql.ungram b/crates/squawk_syntax/src/postgresql.ungram index bf1c76d8f..c87b6e1a9 100644 --- a/crates/squawk_syntax/src/postgresql.ungram +++ b/crates/squawk_syntax/src/postgresql.ungram @@ -362,6 +362,7 @@ Literal = | '@dollar_quoted_string' | '@unicode_esc_string' | '@esc_string' + | '@national_string' | '@positional_param' | 'default' | 'false' diff --git a/crates/squawk_syntax/src/snapshots/squawk_syntax__test__custom_operators_validation.snap b/crates/squawk_syntax/src/snapshots/squawk_syntax__test__custom_operators_validation.snap index c756e88be..5f3a914c1 100644 --- a/crates/squawk_syntax/src/snapshots/squawk_syntax__test__custom_operators_validation.snap +++ b/crates/squawk_syntax/src/snapshots/squawk_syntax__test__custom_operators_validation.snap @@ -267,69 +267,69 @@ SOURCE_FILE@0..423 INT_NUMBER@354..355 "1" WHITESPACE@355..356 " " CUSTOM_OP@356..419 - PLUS@356..357 "+" - PLUS@357..358 "+" - PLUS@358..359 "+" - PLUS@359..360 "+" - PLUS@360..361 "+" - PLUS@361..362 "+" - PLUS@362..363 "+" - PLUS@363..364 "+" - PLUS@364..365 "+" - PLUS@365..366 "+" - PLUS@366..367 "+" - PLUS@367..368 "+" - PLUS@368..369 "+" - PLUS@369..370 "+" - PLUS@370..371 "+" - PLUS@371..372 "+" - PLUS@372..373 "+" - PLUS@373..374 "+" - PLUS@374..375 "+" - PLUS@375..376 "+" - PLUS@376..377 "+" - PLUS@377..378 "+" - PLUS@378..379 "+" - PLUS@379..380 "+" - PLUS@380..381 "+" - PLUS@381..382 "+" - PLUS@382..383 "+" - PLUS@383..384 "+" - PLUS@384..385 "+" - PLUS@385..386 "+" - PLUS@386..387 "+" - PLUS@387..388 "+" - PLUS@388..389 "+" - PLUS@389..390 "+" - PLUS@390..391 "+" - PLUS@391..392 "+" - PLUS@392..393 "+" - PLUS@393..394 "+" - PLUS@394..395 "+" - PLUS@395..396 "+" - PLUS@396..397 "+" - PLUS@397..398 "+" - PLUS@398..399 "+" - PLUS@399..400 "+" - PLUS@400..401 "+" - PLUS@401..402 "+" - PLUS@402..403 "+" - PLUS@403..404 "+" - PLUS@404..405 "+" - PLUS@405..406 "+" - PLUS@406..407 "+" - PLUS@407..408 "+" - PLUS@408..409 "+" - PLUS@409..410 "+" - PLUS@410..411 "+" - PLUS@411..412 "+" - PLUS@412..413 "+" - PLUS@413..414 "+" - PLUS@414..415 "+" - PLUS@415..416 "+" - PLUS@416..417 "+" - PLUS@417..418 "+" - PLUS@418..419 "+" + PIPE@356..357 "|" + PIPE@357..358 "|" + PIPE@358..359 "|" + PIPE@359..360 "|" + PIPE@360..361 "|" + PIPE@361..362 "|" + PIPE@362..363 "|" + PIPE@363..364 "|" + PIPE@364..365 "|" + PIPE@365..366 "|" + PIPE@366..367 "|" + PIPE@367..368 "|" + PIPE@368..369 "|" + PIPE@369..370 "|" + PIPE@370..371 "|" + PIPE@371..372 "|" + PIPE@372..373 "|" + PIPE@373..374 "|" + PIPE@374..375 "|" + PIPE@375..376 "|" + PIPE@376..377 "|" + PIPE@377..378 "|" + PIPE@378..379 "|" + PIPE@379..380 "|" + PIPE@380..381 "|" + PIPE@381..382 "|" + PIPE@382..383 "|" + PIPE@383..384 "|" + PIPE@384..385 "|" + PIPE@385..386 "|" + PIPE@386..387 "|" + PIPE@387..388 "|" + PIPE@388..389 "|" + PIPE@389..390 "|" + PIPE@390..391 "|" + PIPE@391..392 "|" + PIPE@392..393 "|" + PIPE@393..394 "|" + PIPE@394..395 "|" + PIPE@395..396 "|" + PIPE@396..397 "|" + PIPE@397..398 "|" + PIPE@398..399 "|" + PIPE@399..400 "|" + PIPE@400..401 "|" + PIPE@401..402 "|" + PIPE@402..403 "|" + PIPE@403..404 "|" + PIPE@404..405 "|" + PIPE@405..406 "|" + PIPE@406..407 "|" + PIPE@407..408 "|" + PIPE@408..409 "|" + PIPE@409..410 "|" + PIPE@410..411 "|" + PIPE@411..412 "|" + PIPE@412..413 "|" + PIPE@413..414 "|" + PIPE@414..415 "|" + PIPE@415..416 "|" + PIPE@416..417 "|" + PIPE@417..418 "|" + PIPE@418..419 "|" WHITESPACE@419..420 " " LITERAL@420..421 INT_NUMBER@420..421 "2" diff --git a/crates/squawk_syntax/src/snapshots/squawk_syntax__test__dollar_quoted_string_validation.snap b/crates/squawk_syntax/src/snapshots/squawk_syntax__test__dollar_quoted_string_validation.snap index 4ce707fde..f6a1e76cd 100644 --- a/crates/squawk_syntax/src/snapshots/squawk_syntax__test__dollar_quoted_string_validation.snap +++ b/crates/squawk_syntax/src/snapshots/squawk_syntax__test__dollar_quoted_string_validation.snap @@ -9,16 +9,20 @@ SOURCE_FILE@0..30 WHITESPACE@6..7 " " TARGET_LIST@7..28 TARGET@7..28 - LITERAL@7..28 - DOLLAR_QUOTED_STRING@7..28 "$foo-bar$abc$foo-bar$" + BIN_EXPR@7..28 + BIN_EXPR@7..23 + LITERAL@7..11 + POSITIONAL_PARAM@7..11 "$foo" + MINUS@11..12 "-" + NAME_REF@12..23 + IDENT@12..23 "bar$abc$foo" + MINUS@23..24 "-" + NAME_REF@24..28 + IDENT@24..28 "bar$" SEMICOLON@28..29 ";" WHITESPACE@29..30 "\n" -error[syntax-error]: "-" is not allowed in dollar quote tags +error[syntax-error]: missing parameter number β•­β–Έ 1 β”‚ select $foo-bar$abc$foo-bar$; - β•°β•΄ ━ -error[syntax-error]: "-" is not allowed in dollar quote tags - β•­β–Έ -1 β”‚ select $foo-bar$abc$foo-bar$; - β•°β•΄ ━ + β•°β•΄ ━ diff --git a/crates/squawk_syntax/src/validation.rs b/crates/squawk_syntax/src/validation.rs index 7d35505da..2a17b1491 100644 --- a/crates/squawk_syntax/src/validation.rs +++ b/crates/squawk_syntax/src/validation.rs @@ -7,7 +7,6 @@ use std::ops::Range; use crate::ast::{AstNode, LitKind}; -use crate::quote::{dollar_quote_tag, strip_dollar_quotes}; use crate::unescape::{escape_unicode_esc_str, uescape_char}; use crate::{SyntaxNode, SyntaxToken, ast, match_ast, syntax_error::SyntaxError}; use rowan::{TextRange, TextSize}; @@ -208,7 +207,12 @@ fn validate_literal(lit: ast::Literal, acc: &mut Vec) { LookingFor::OpenString => { if matches!( token.kind(), - STRING | ESC_STRING | BIT_STRING | BYTE_STRING | UNICODE_ESC_STRING + STRING + | ESC_STRING + | BIT_STRING + | BYTE_STRING + | UNICODE_ESC_STRING + | NATIONAL_STRING ) { state = LookingFor::CloseString(token.text_range().end(), false); } @@ -249,51 +253,9 @@ fn validate_literal(lit: ast::Literal, acc: &mut Vec) { validate_unicode_esc_string(&lit, acc); validate_prefixed_strings(&lit, acc); - validate_dollar_quoted_string(&lit, acc); validate_default_literal(&lit, acc); } -fn validate_dollar_quoted_string(lit: &ast::Literal, acc: &mut Vec) { - let Some(LitKind::DollarQuotedString(token)) = lit.kind() else { - return; - }; - let text = token.text(); - let Some(tag) = dollar_quote_tag(text) else { - return; - }; - let closing_tag_start = strip_dollar_quotes(text).map(|_| text.len() - tag.len() - 1); - let token_start = token.text_range().start(); - for tag_start in [Some(1), closing_tag_start].into_iter().flatten() { - validate_dollar_quote_tag(tag, token_start + TextSize::new(tag_start as u32), acc); - } -} - -// dolq_start [A-Za-z\200-\377_] -const fn is_dollar_quote_tag_start(c: char) -> bool { - matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '\u{80}'..) -} - -// dolq_cont [A-Za-z\200-\377_0-9] -const fn is_dollar_quote_tag_cont(c: char) -> bool { - matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9' | '\u{80}'..) -} - -fn validate_dollar_quote_tag(tag: &str, tag_start: TextSize, acc: &mut Vec) { - for (i, c) in tag.char_indices() { - let is_valid = if i == 0 { - is_dollar_quote_tag_start(c) - } else { - is_dollar_quote_tag_cont(c) - }; - if !is_valid { - acc.push(SyntaxError::new( - format!(r#""{c}" is not allowed in dollar quote tags"#), - offset_range(tag_start, i..i + c.len_utf8()), - )); - } - } -} - fn validate_default_literal(lit: &ast::Literal, acc: &mut Vec) { if !matches!(lit.kind(), Some(LitKind::Default(_))) { return; diff --git a/crates/squawk_syntax/test_data/validation/custom_operators.sql b/crates/squawk_syntax/test_data/validation/custom_operators.sql index 93c114a08..84830e5b5 100644 --- a/crates/squawk_syntax/test_data/validation/custom_operators.sql +++ b/crates/squawk_syntax/test_data/validation/custom_operators.sql @@ -10,4 +10,4 @@ select ^m; select |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| 1; select 1 |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| 2; -- 63 chars is still allowed -select 1 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2; +select 1 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| 2;