From 30d64d9b65a94bd351023d056b1398542f2f7367 Mon Sep 17 00:00:00 2001 From: erick-alcachofa Date: Fri, 26 Dec 2025 00:28:18 -0600 Subject: [PATCH] fix(parser): support empty blocks, support nested scoping, and refine loop lookahead Signed-off-by: erick-alcachofa This commit addresses several critical issues in the recursive descent parser, specifically regarding the handling of empty constructs, statement termination, and AST representation of nested scopes. These changes bring the implementation in line with the Artichoke EBNF specification. * **CodeBlock as Statement:** Added `CodeBlockStmtNode` to the `StatementNode` variant. This allows a bare `{}` to be treated as a valid statement, enabling manual scoping within functions. * **Visitor Support:** Updated `toDot.cpp` (Graphviz) and `toString.cpp` (Pretty-print) to support the new `CodeBlockStmtNode` during AST traversal. * **Empty Member Lists:** Implemented a pre-loop check for the closing brace `}` in `parseStruct` and `parseEnum`. This prevents the parser from attempting to parse members in empty declarations (e.g., `struct Empty {}`). * **Diagnostic Accuracy:** Enhanced the member-parsing loop to provide better error context. If a member is not followed by a comma or a closing brace, the parser now explicitly suggests `',' or '}'` as the expected tokens. * **Nested Scopes:** The parser now correctly identifies a `{` at the start of a statement and dispatches to `parseCodeBlock`. * **Empty Code Blocks:** Added a guard in the block-parsing loop to check for `}` immediately after `{`, allowing functions or nested scopes to be empty. * **C-Style For-Loops:** Replaced `match` with `matchAndConsume` for the initialization semicolon. This allows the parser to correctly handle loops where the initialization is omitted (e.g., `for (; 1; 1)`). * **Statement Termination:** Added a required `;` (via `consume(TokenV::opSemicolon, "';'")`) at the end of a statement parser in `Statements.cpp`, immediately after its existing `rParen` check, so that statement must now end with an explicit semicolon. * **Correctness:** Resolves parser hangs or errors when encountering empty blocks. * **Compliance:** Fully supports the EBNF definition of zero-or-more members/statements. 
* **Visuals:** AST diagrams now accurately reflect nested block structures. --- .../artichoke/Parser/AST/Statements.hpp | 3 +- lib/src/Parser/AST/toDot.cpp | 1 + lib/src/Parser/AST/toString.cpp | 3 + lib/src/Parser/Declarations.cpp | 60 ++++++++++++++----- lib/src/Parser/Statements.cpp | 37 +++++++++++- 5 files changed, 88 insertions(+), 16 deletions(-) diff --git a/lib/include/artichoke/Parser/AST/Statements.hpp b/lib/include/artichoke/Parser/AST/Statements.hpp index f0b50a4..adfc6d1 100644 --- a/lib/include/artichoke/Parser/AST/Statements.hpp +++ b/lib/include/artichoke/Parser/AST/Statements.hpp @@ -93,7 +93,8 @@ namespace arti::lang::ast { WhileStmtNode, DoWhileStmtNode, InfLoopStmtNode, - ExpressionStmtNode + ExpressionStmtNode, + CodeBlockStmtNode >; using ElseBranchNode = Variant< diff --git a/lib/src/Parser/AST/toDot.cpp b/lib/src/Parser/AST/toDot.cpp index 811b923..9e5eb19 100644 --- a/lib/src/Parser/AST/toDot.cpp +++ b/lib/src/Parser/AST/toDot.cpp @@ -1002,6 +1002,7 @@ namespace arti::lang::ast { [&g](const DoWhileStmtNode &n) { return emit(n, g); }, [&g](const InfLoopStmtNode &n) { return emit(n, g); }, [&g](const ExpressionStmtNode &n) { return emit(n, g); }, + [&g](const CodeBlockStmtNode &n) { return emit(n, g); }, }; return std::visit(visitor, node); } diff --git a/lib/src/Parser/AST/toString.cpp b/lib/src/Parser/AST/toString.cpp index c86b264..9617e30 100644 --- a/lib/src/Parser/AST/toString.cpp +++ b/lib/src/Parser/AST/toString.cpp @@ -1419,6 +1419,9 @@ namespace arti::lang::ast { [padding](const ExpressionStmtNode &node) -> std::string { return toString(node, padding); }, + [padding](const CodeBlockStmtNode &node) -> std::string { + return toString(node, padding); + } }; return std::visit(visitor, node); diff --git a/lib/src/Parser/Declarations.cpp b/lib/src/Parser/Declarations.cpp index 00b4e79..fa33f69 100644 --- a/lib/src/Parser/Declarations.cpp +++ b/lib/src/Parser/Declarations.cpp @@ -468,6 +468,13 @@ namespace arti::lang { bool keepParsing = 
true; + if (auto close = match(TokenV::opRSquirly); ! close) { + return Unexpected{ std::move(close).error() }; + } + else if (close.value()) { + keepParsing = false; + } + while (keepParsing) { if (auto member = parseStructMember(); ! member) { return Unexpected{ std::move(member).error() }; @@ -479,13 +486,22 @@ namespace arti::lang { if (auto comma = matchAndConsume(TokenV::opComma); ! comma) { return Unexpected<>{ std::move(comma).error() }; } - - if (auto peekToken = tokenizer.peek(); ! peekToken) { - return Unexpected{ std::move(peekToken).error() }; - } - else { - if (peekToken->value == TokenV::opRSquirly) { - keepParsing = false; + else if (! comma.value()) { + if (auto peekToken = tokenizer.peek(); ! peekToken) { + return Unexpected{ std::move(peekToken).error() }; + } + else { + if (peekToken->value != TokenV::opRSquirly) { + return langException( + peekToken->line, + peekToken->column, + toString(*peekToken), + "',' or '}'" + ); + } + else { + keepParsing = false; + } } } } @@ -533,6 +549,13 @@ namespace arti::lang { bool keepParsing = true; + if (auto close = match(TokenV::opRSquirly); ! close) { + return Unexpected{ std::move(close).error() }; + } + else if (close.value()) { + keepParsing = false; + } + while (keepParsing) { if (auto member = parseEnumMember(); ! member) { return Unexpected{ std::move(member).error() }; @@ -544,13 +567,22 @@ namespace arti::lang { if (auto comma = matchAndConsume(TokenV::opComma); ! comma) { return Unexpected<>{ std::move(comma).error() }; } - - if (auto peekToken = tokenizer.peek(); ! peekToken) { - return Unexpected{ std::move(peekToken).error() }; - } - else { - if (peekToken->value == TokenV::opRSquirly) { - keepParsing = false; + else if (! comma.value()) { + if (auto peekToken = tokenizer.peek(); ! 
peekToken) { + return Unexpected{ std::move(peekToken).error() }; + } + else { + if (peekToken->value != TokenV::opRSquirly) { + return langException( + peekToken->line, + peekToken->column, + toString(*peekToken), + "',' or '}'" + ); + } + else { + keepParsing = false; + } } } } diff --git a/lib/src/Parser/Statements.cpp b/lib/src/Parser/Statements.cpp index dcaddf4..eab6553 100644 --- a/lib/src/Parser/Statements.cpp +++ b/lib/src/Parser/Statements.cpp @@ -36,6 +36,13 @@ namespace arti::lang { return Unexpected<>{ std::move(lsquirly).error() }; } + if (auto close = match(TokenV::opRSquirly); ! close) { + return Unexpected<>{ std::move(close).error() }; + } + else if (close.value()) { + keepParsing = false; + } + while (keepParsing) { if (auto ok = parseStatement(); ! ok) { return Unexpected<>{ std::move(ok).error() }; @@ -50,6 +57,13 @@ namespace arti::lang { node->statements.push_back(std::move(stmt).value()); } } + + if (auto close = match(TokenV::opRSquirly); ! close) { + return Unexpected<>{ std::move(close).error() }; + } + else if (close.value()) { + keepParsing = false; + } } if (auto rsquirly = consume(TokenV::opRSquirly, "'}'"); ! rsquirly) { @@ -300,6 +314,23 @@ namespace arti::lang { return ast::StatementNode{ std::move(stmt).value() }; } } + else if (peekToken->value == TokenV::opLSquirly) { + if (label.has_value()) { + return langException( + peekToken->line, + peekToken->column, + toString(*peekToken), + "loop keyword, i.e. any of ( for, while, do, loop )" + ); + } + + if (auto stmt = parseCodeBlock(); ! stmt) { + return Unexpected<>{ std::move(stmt).error() }; + } + else { + return ast::StatementNode{ std::move(stmt).value() }; + } + } else { if (label.has_value()) { return langException( @@ -981,7 +1012,7 @@ namespace arti::lang { return Unexpected<>{ std::move(lParen).error() }; } - if (auto skipPre = match(TokenV::opSemicolon); ! skipPre) { + if (auto skipPre = matchAndConsume(TokenV::opSemicolon); ! 
skipPre) { return Unexpected<>{ std::move(skipPre).error() }; } else if (not skipPre.value()) { @@ -1234,6 +1265,10 @@ namespace arti::lang { return Unexpected<>{ std::move(rParen).error() }; } + if (auto rParen = consume(TokenV::opSemicolon, "';'"); ! rParen) { + return Unexpected<>{ std::move(rParen).error() }; + } + return node; }