refactor(parser): overhaul parsing logic and enhance error reporting

Signed-off-by: erick-alcachofa <erick@artichoke.dev>

Major refactoring of the Parser and Tokenizer components to improve code
maintainability, strengthen error messaging, and streamline AST
generation.

This version intentionally focuses on top-level declarations, with
statement parsing stubbed for the next development phase.

- **Path Sanitization**: Added `sanitizePath` to extract filenames from
  input paths, ensuring consistent `unitName` identification regardless
  of directory depth.
- **Improved Output**: Wrapped AST string output in Markdown code blocks
  and added a commented-out entry for the new DOT graph visualization.

- **Unified Consumption**: Replaced manual token checks with a more
  robust `consume()` method that leverages `peekExpect()` for
  centralized error handling.
- **New Predicates**: Introduced `match()` and `matchAndConsume()`
  helpers to handle optional tokens and branching logic without
  redundant peek/consume calls.
- **Exception Handling**: Standardized the use of `langException` across
  all parsing functions, providing more descriptive "Expected X, found
  Y" messages.

- **Declarations**: Refactored `parseTopLevelDeclaration` and
  sub-parsers (Module, Struct, Enum, Fn) to use the new matching
  patterns.
- **Looping Logic**: Replaced recursive-style parsing loops with
  `while(keepParsing)` iterative blocks to prevent stack depth issues
  and clarify termination conditions (e.g., finding a closing brace or
  failing to find a comma).
- **Namespaced Identifiers**: Rewrote `parseNamespacedIdentifier` to
  correctly handle multi-part paths (`A::B::C`) and edge cases such as
  a trailing `::` that is not followed by an identifier.
- **Generic Support**: Improved handling of generic parameter and
  argument lists, ensuring strict enforcement of delimiters like `<` and
  `>`.

- **Contextual Errors**: Updated `peekExpect` to accept a custom
  `message` string, allowing the parser to describe *what* it was
  looking for (e.g., "Expected ';'").
- **Token Lookahead**: Enhanced `peek` and `peekExpect` reliability with
  better bounds checking and buffer management.

- **Removed `lib/src/Parser/AST/AST.cpp`**: Deleted the monolithic AST
  stringification file in favor of the previously introduced modular
  implementations.
- **Build System**: Updated `.gitignore` to ignore
  `cpm-package-lock.cmake`.
This commit is contained in:
erick-alcachofa 2025-12-25 11:41:08 -06:00
parent e2fa44738f
commit 8911702c0d
Signed by: me
GPG Key ID: 6FA5F8643444BAFA
10 changed files with 581 additions and 2565 deletions

2
.gitignore vendored
View File

@ -3,4 +3,6 @@
build/**
install/**
cpm-package-lock.cmake
TODO.md

View File

@ -1,8 +1,15 @@
#include <print>
#include <fstream>
#include <filesystem>
#include <artichoke/Parser/Parser.hpp>
/// Reduces `path` to the component reported by
/// `std::filesystem::path::filename()`, returned as an owning string.
/// Any leading directory components are dropped.
std::string sanitizePath(std::string_view path) {
    return std::filesystem::path{ path }.filename().string();
}
int main(int argc, char **argv) {
using namespace arti::lang;
@ -12,7 +19,7 @@ int main(int argc, char **argv) {
}
std::ifstream file;
file.open(argv[1]);
file.open(sanitizePath(argv[1]));
if (! file.is_open()) {
std::println("Failed to open file {}", argv[1]);
@ -22,7 +29,7 @@ int main(int argc, char **argv) {
std::string buffer{ std::istreambuf_iterator<char>(file),
std::istreambuf_iterator<char>() };
auto parser = Parser{ buffer };
auto parser = Parser{ sanitizePath(argv[1]), buffer };
auto res = parser.parse();
@ -40,5 +47,10 @@ int main(int argc, char **argv) {
auto ast = std::move(res).value();
std::println("# AST");
std::println("```markdown");
std::println("{}", ast::toString(ast));
std::println("```");
// std::println("{}", ast::toDot(ast));
}

View File

@ -126,22 +126,13 @@ namespace arti::lang {
Expected<ast::InfLoopStmtNode>
parseInfLoopStatement();
Expected<Token> consume(TokenV type, std::string_view expected_name) {
auto peeked = tokenizer.peek();
Expected<Token> consume(TokenV type, std::string_view message) {
auto peeked = tokenizer.peekExpect(type, message);
if (! peeked) {
return Unexpected<>{ std::move(peeked).error() };
}
if (peeked->value != type) {
return langException<ExceptCode::ecUnexpectedToken>(
peeked->line,
peeked->column,
toString(*peeked),
expected_name
);
}
std::ignore = tokenizer.consume();
return peeked;
@ -163,6 +154,16 @@ namespace arti::lang {
return true;
}
/// Checks whether the token `offset` positions ahead has value `type`.
///
/// The token is obtained with `tokenizer.peek(offset)`; this helper never
/// calls `consume`. If peeking fails, the tokenizer's error is forwarded
/// to the caller unchanged.
///
/// @param type   Token value to compare against.
/// @param offset Lookahead distance passed to `tokenizer.peek` (default 0).
/// @return `true`/`false` for the comparison, or the peek error.
Expected<bool> match(TokenV type, std::size_t offset = 0) {
    auto lookahead = tokenizer.peek(offset);

    if (lookahead) {
        return lookahead->value == type;
    }

    return Unexpected<>{ std::move(lookahead).error() };
}
private:
std::string unitName;
std::string sourceCode;

View File

@ -25,7 +25,11 @@ namespace arti::lang {
Expected<void> consume(std::size_t n = 1) noexcept;
Expected<Token> peek(std::size_t n = 0) noexcept;
Expected<Token> peekExpect(std::size_t n, TokenV tokenType) noexcept;
Expected<Token> peekExpect(
TokenV tokenType,
std::string_view message = "",
std::size_t n = 0
) noexcept;
bool finished() const noexcept;

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -14,30 +14,28 @@ namespace arti::lang {
Expected<ast::AST> Parser::parse() {
auto unit = ast::MakeNode<ast::AST>();
auto tlDecl = ast::Optional<ast::TopLevelDeclNode>{};
auto decl = ast::Optional<ast::TopLevelDeclNode>{};
bool keepParsing = true;
unit->unitName = this->unitName;
while (keepParsing) {
if (auto ok = parseTopLevelDeclaration(); ok) {
tlDecl = std::move(ok).value();
if (auto ok = parseTopLevelDeclaration(); ! ok) {
return Unexpected<>{ std::move(ok).error() };
}
else {
decl = std::move(ok).value();
if (! tlDecl.has_value()) {
if (! decl.has_value()) {
keepParsing = false;
}
else {
unit->declarations.push_back(std::move(tlDecl).value());
unit->declarations.push_back(std::move(decl).value());
}
}
else {
return Unexpected<>{ std::move(ok).error() };
}
}
auto eof = tokenizer.peekExpect(0, TokenV::tkEOF);
if (! eof) {
if (auto eof = consume(TokenV::tkEOF, "end of compilation unit"); ! eof) {
return Unexpected<>{ std::move(eof).error() };
}

View File

@ -8,6 +8,10 @@ namespace arti::lang {
auto stmt = ast::Optional<ast::StatementNode>{};
bool keepParsing = true;
if (auto lsquirly = consume(TokenV::opLSquirly, "'{'"); ! lsquirly) {
return Unexpected<>{ std::move(lsquirly).error() };
}
while (keepParsing) {
if (auto ok = parseStatement(); ok) {
stmt = std::move(ok).value();
@ -24,12 +28,20 @@ namespace arti::lang {
}
}
if (auto rsquirly = consume(TokenV::opRSquirly, "'}'"); ! rsquirly) {
return Unexpected<>{ std::move(rsquirly).error() };
}
return node;
}
/* Placeholder statement parser: currently always returns an empty
* optional (`std::nullopt`) and reads no tokens. */
Expected<ast::Optional<ast::StatementNode>>
Parser::parseStatement() {
/* TODO: Implement statement parsing logic.
* This is intentionally stubbed while the parser architecture
* is still in development.
* Currently, the compiler is in a 'Declarations-Only' state. */
return std::nullopt;
}

View File

@ -1,138 +1,130 @@
#include <artichoke/Parser/Parser.hpp>
#include <print>
namespace arti::lang {
Expected<ast::NamespacedIdentifierNode> Parser::parseNamespacedIdentifier() {
auto node = ast::MakeNode<ast::NamespacedIdentifierNode>();
auto ident = tokenizer.peekExpect(0, TokenV::tkIdentifier);
bool keepParsing = true;
if (! ident) {
return Unexpected<>{ std::move(ident).error() };
}
else {
node->location = { .line = ident->line, .column = ident->column };
node->identParts.emplace_back(ident->strValue);
}
std::ignore = tokenizer.consume();
auto peekNext = tokenizer.peek();
if (! peekNext) {
return Unexpected<>{ std::move(peekNext).error() };
}
while (peekNext->value == TokenV::opAccess) {
ident = tokenizer.peek(1);
if (! ident) {
return node;
while (keepParsing) {
if (auto ident = match(TokenV::tkIdentifier); ! ident) {
return Unexpected<>{ std::move(ident).error() };
}
else if (ident.value()) {
if (auto ident = consume(TokenV::tkIdentifier, "identifier"); ! ident) {
return Unexpected<>{ std::move(ident).error() };
}
else {
node->location = { .line = ident->line, .column = ident->column };
node->identParts.emplace_back(ident->strValue);
}
}
else {
if (ident->value != TokenV::tkIdentifier) {
return node;
}
node->identParts.emplace_back(ident->strValue);
return node;
}
std::ignore = tokenizer.consume(2);
peekNext = tokenizer.peek();
if (! peekNext) {
return Unexpected<>{ std::move(peekNext).error() };
if (auto access = match(TokenV::opAccess); ! access) {
return Unexpected<>{ std::move(access).error() };
}
else if (access.value()) {
if (auto ident = match(TokenV::tkIdentifier, 1); ! ident) {
return Unexpected<>{ std::move(ident).error() };
}
else if (not ident.value()) {
keepParsing = false;
}
else {
if (auto colon = consume(TokenV::opAccess, "':'"); ! colon) {
return Unexpected<>{ std::move(colon).error() };
}
}
}
else {
keepParsing = false;
}
}
return node;
}
Expected<ast::TypeNode> Parser::parseType() {
auto node = ast::MakeNode<ast::TypeNode>();
auto currentNode = ast::TypeExpressionNode{};
if (auto peekNext = tokenizer.peek(); ! peekNext) {
return Unexpected<>{ std::move(peekNext).error() };
if (auto nextToken = tokenizer.peek(); ! nextToken) {
return Unexpected<>{ std::move(nextToken).error() };
}
else {
node->location = { .line = peekNext->line, .column = peekNext->column };
if (peekNext->value != TokenV::tkIdentifier) {
auto qualifiers = parseTypeQualifiers();
if (! qualifiers) {
return Unexpected<>{ std::move(qualifiers).error() };
}
node->location = { .line = nextToken->line, .column = nextToken->column };
}
if (auto ident = match(TokenV::tkIdentifier); ! ident) {
return Unexpected<>{ std::move(ident).error() };
}
else if (not ident.value()) {
if (auto qualifiers = parseTypeQualifiers(); ! qualifiers) {
return Unexpected<>{ std::move(qualifiers).error() };
}
else {
node->qualifiers = std::move(qualifiers).value();
}
}
auto identType = parseNamespacedIdentifier();
if (! identType) {
if (auto identType = parseNamespacedIdentifier(); ! identType) {
return Unexpected<>{ std::move(identType).error() };
}
else {
currentNode = ast::MakeNode<ast::IdentifierTypeNode>();
auto currentNode = ast::TypeExpressionNode{};
std::get<ast::IdentifierTypeNode>(currentNode)->location =
(*identType)->location;
currentNode = ast::MakeNode<ast::IdentifierTypeNode>();
std::get<ast::IdentifierTypeNode>(currentNode)->location =
(*identType)->location;
std::get<ast::IdentifierTypeNode>(currentNode)->typeName =
std::move(identType).value();
auto peekNext = tokenizer.peek();
if (! peekNext) {
return Unexpected<>{ std::move(peekNext).error() };
std::get<ast::IdentifierTypeNode>(currentNode)->typeName =
std::move(identType).value();
}
if (peekNext->value == TokenV::opLt) {
std::ignore = tokenizer.consume();
auto args = parseGenericArgumentsList();
if (! args) {
return Unexpected<>{ std::move(args).error() };
if (auto lt = matchAndConsume(TokenV::opLt); ! lt) {
return Unexpected<>{ std::move(lt).error() };
}
else if (lt.value()) {
if (auto genericArgs = parseGenericArgumentsList(); ! genericArgs) {
return Unexpected<>{ std::move(genericArgs).error() };
}
else {
auto genParamsNode = ast::MakeNode<ast::GenericTypeNode>();
if (auto closeG = tokenizer.peekExpect(0, TokenV::opGt); ! closeG) {
return Unexpected<>{ std::move(closeG).error() };
}
std::ignore = tokenizer.consume();
genParamsNode->location = std::visit(
[](const auto &node) { return node->location; },
currentNode
);
auto newNode = ast::MakeNode<ast::GenericTypeNode>();
genParamsNode->baseType = std::move(currentNode);
genParamsNode->genericArgs = std::move(genericArgs).value();
currentNode = std::move(genParamsNode);
newNode->location = std::visit(
[](const auto &node) { return node->location; },
currentNode
);
newNode->baseType = std::move(currentNode);
newNode->genericArgs = std::move(args).value();
currentNode = std::move(newNode);
peekNext = tokenizer.peek();
if (! peekNext) {
return Unexpected<>{ std::move(peekNext).error() };
if (auto gt = consume(TokenV::opGt, "'>'"); ! gt) {
return Unexpected<>{ std::move(gt).error() };
}
}
}
while (peekNext->value == TokenV::opAccess) {
std::ignore = tokenizer.consume();
bool keepParsing = false;
auto ident = tokenizer.peekExpect(0, TokenV::tkIdentifier);
if (auto access = matchAndConsume(TokenV::opAccess); ! access) {
return Unexpected<>{ std::move(access).error() };
}
else if (access.value()) {
keepParsing = true;
}
if (! ident) {
while (keepParsing) {
if (auto ident = consume(TokenV::tkIdentifier, "identifier"); ! ident) {
return Unexpected<>{ std::move(ident).error() };
}
else {
std::ignore = tokenizer.consume();
auto newNode = ast::MakeNode<ast::NamespacedTypeNode>();
newNode->location = std::visit(
@ -143,44 +135,38 @@ namespace arti::lang {
newNode->typeName = ident->strValue;
newNode->baseType = std::move(currentNode);
currentNode = std::move(newNode);
}
peekNext = tokenizer.peek();
if (auto lt = matchAndConsume(TokenV::opLt); ! lt) {
return Unexpected<>{ std::move(lt).error() };
}
else if (lt.value()) {
if (auto genericArgs = parseGenericArgumentsList(); ! genericArgs) {
return Unexpected<>{ std::move(genericArgs).error() };
}
else {
auto genParamsNode = ast::MakeNode<ast::GenericTypeNode>();
if (! peekNext) {
return Unexpected<>{ std::move(peekNext).error() };
genParamsNode->location = std::visit(
[](const auto &node) { return node->location; },
currentNode
);
genParamsNode->baseType = std::move(currentNode);
genParamsNode->genericArgs = std::move(genericArgs).value();
currentNode = std::move(genParamsNode);
if (auto gt = consume(TokenV::opGt, "'>'"); ! gt) {
return Unexpected<>{ std::move(gt).error() };
}
}
}
if (peekNext->value == TokenV::opLt) {
std::ignore = tokenizer.consume();
auto args = parseGenericArgumentsList();
if (! args) {
return Unexpected<>{ std::move(args).error() };
}
if (auto closeG = tokenizer.peekExpect(0, TokenV::opGt); ! closeG) {
return Unexpected<>{ std::move(closeG).error() };
}
std::ignore = tokenizer.consume();
auto newNode = ast::MakeNode<ast::GenericTypeNode>();
newNode->location = std::visit(
[](const auto &node) { return node->location; },
currentNode
);
newNode->baseType = std::move(currentNode);
newNode->genericArgs = std::move(args).value();
currentNode = std::move(newNode);
peekNext = tokenizer.peek();
if (! peekNext) {
return Unexpected<>{ std::move(peekNext).error() };
}
if (auto access = matchAndConsume(TokenV::opAccess); ! access) {
return Unexpected<>{ std::move(access).error() };
}
else {
keepParsing = access.value();
}
}
@ -235,7 +221,7 @@ namespace arti::lang {
case opLBracket:
std::ignore = tokenizer.consume();
peekToken = tokenizer.peekExpect(0, opRBracket);
peekToken = tokenizer.peekExpect(opRBracket);
if (! peekToken) {
return Unexpected<>{ std::move(peekToken).error() };
@ -269,29 +255,46 @@ namespace arti::lang {
return Unexpected{ std::move(peekToken).error() };
}
while (peekToken->value != TokenV::opGt) {
auto type = parseType();
bool keepParsing = true;
if (! type) {
if (auto comma = tokenizer.peek();
comma and comma->value == TokenV::opComma) {
return langException<ExceptCode::ecUnexpectedToken>(
comma->line,
comma->column,
toString(*comma),
"type"
);
}
while (keepParsing) {
if (auto type = parseType(); ! type) {
return Unexpected<>{ std::move(type).error() };
}
args.push_back(std::move(type).value());
peekToken = tokenizer.peek();
if (! peekToken) {
return Unexpected{ std::move(peekToken).error() };
else {
args.push_back(std::move(type).value());
}
if (peekToken->value == TokenV::opComma) {
std::ignore = tokenizer.consume();
peekToken = tokenizer.peek();
if (! peekToken) {
if (auto comma = matchAndConsume(TokenV::opComma); ! comma) {
return Unexpected{ std::move(comma).error() };
}
else if (! comma.value()) {
if (peekToken = tokenizer.peek(); ! peekToken) {
return Unexpected{ std::move(peekToken).error() };
}
else {
if (peekToken->value != TokenV::opGt) {
return langException<ExceptCode::ecUnexpectedToken>(
peekToken->line,
peekToken->column,
toString(*peekToken),
"',' or '>'"
);
}
else {
keepParsing = false;
}
}
}
}

View File

@ -106,8 +106,15 @@ namespace arti::lang {
return tokensBuffer.at(n);
}
Expected<Token>
Tokenizer::peekExpect(std::size_t n, TokenV tokenType) noexcept {
Expected<Token> Tokenizer::peekExpect(
TokenV tokenType,
std::string_view message,
std::size_t n
) noexcept {
if (message.empty()) {
message = toString(tokenType);
}
if (tokensBuffer.size() > (n + 1)) {
auto tokenAt = tokensBuffer.at(n);
@ -116,7 +123,7 @@ namespace arti::lang {
tokenAt.line,
tokenAt.column,
toString(tokenAt),
toString(tokenType)
message
);
}
@ -125,7 +132,7 @@ namespace arti::lang {
auto token = peek(n);
if (!token) {
if (! token) {
return token;
}
@ -137,7 +144,7 @@ namespace arti::lang {
tokensBuffer.pop_back();
tokensBuffer.push_back(*token);
token->column += 1;
token->strValue = std::string_view{token->strValue.begin() + 1, 1};
token->strValue = std::string_view{ token->strValue.begin() + 1, 1 };
tokensBuffer.push_back(*token);
token = peekTok;
}
@ -147,7 +154,7 @@ namespace arti::lang {
token->line,
token->column,
toString(*token),
toString(tokenType)
message
);
}