feat(test, tokenizer): Add test suite, fix Tokenizer issues it caught, and add range-based API

Signed-off-by: erick-alcachofa <erick@artichoke.dev>

This commit introduces a comprehensive test suite for the tokenizer
using the Catch2 framework. To support this and improve the project
structure, the build system and the tokenizer's API have been
significantly updated.

- Removed `cmake/testing.cmake` as it's no longer needed.
- A new `TokenizerRange` class provides a C++20-style range interface,
  allowing for simple `for-each` loop iteration over tokens. This is
  used extensively in the new tests.

- The CMake build system has been refactored:
    - An `ENABLE_TESTING` option (OFF by default) now controls whether
      the test suite is built.
    - The core library is now compiled into an object library, which is
      then used to produce both a shared (`.so`/`.dll`) and a static
      (`.a`/`.lib`) library. This improves build efficiency and provides
      more flexible linkage options.
    - The frontend executable now links against the static version of
      the library.

- Implemented tests for the tokenizer using the Catch2 framework, covering
  various cases such as identifiers, keywords, and numbers; these tests
  have already caught some issues in the current implementation.

- Several parsing bugs and edge cases in the tokenizer were fixed,
  including the handling of unterminated strings and invalid numeric
  literals. The README has been updated with instructions for building
  and running tests.
This commit is contained in:
erick-alcachofa 2025-10-03 12:54:41 -06:00
parent 0f8688d3ee
commit e1b9e054f3
Signed by: me
GPG Key ID: 6FA5F8643444BAFA
20 changed files with 1323 additions and 40 deletions

View File

@ -15,19 +15,14 @@ set(PROJECT_URL "lang.artichoke.dev")
set(PROJECT_AUTHOR "erick-alcachofa")
set(PROJECT_AUTHOR_GITHUB "@erick-alcachofa")
include(cmake/testing.cmake)
option(ENABLE_TESTING "Enable build of tests for library" OFF)
add_subdirectory(lib)
add_subdirectory(frontend)
install(
TARGETS frontend library
EXPORT artichokeTargets
FILE_SET HEADERS
LIBRARY DESTINATION lib
ARCHIVE DESTINATION lib
TARGETS frontend
RUNTIME DESTINATION bin
INCLUDES DESTINATION include
)
get_target_property(
@ -43,6 +38,16 @@ install(
)"
)
install(
TARGETS library library_static
EXPORT artichokeTargets
FILE_SET HEADERS
LIBRARY DESTINATION lib
ARCHIVE DESTINATION lib
RUNTIME DESTINATION bin
INCLUDES DESTINATION include
)
install(
EXPORT artichokeTargets
FILE artichokeTargets.cmake
@ -62,3 +67,7 @@ install(
"${CMAKE_CURRENT_BINARY_DIR}/artichokeConfigVersion.cmake"
DESTINATION lib/cmake/artichoke
)
if(ENABLE_TESTING)
add_subdirectory(tests)
endif()

View File

@ -46,6 +46,7 @@ grammar is stable, and the next step is the implementation of a compiler
git clone https://git.artichoke.dev/me/artichoke-lang.git
# Configure cmake
# Optionally add -DENABLE_TESTING=ON for building tests
cmake -DCMAKE_BUILD_TYPE=Release -S . -B build
# Build the project
@ -54,6 +55,9 @@ cmake --build build
# Run the binary
./build/frontend/artichoke-c
# Run the tests if enabled
ctest --test-dir build/tests --output-on-failure
# Install if wanted
cmake --install build --prefix=/usr/local

View File

View File

@ -34,5 +34,5 @@ target_include_directories(
target_link_libraries(
frontend PUBLIC
library
artichoke::library_static
)

View File

@ -5,17 +5,17 @@ file(GLOB_RECURSE ARTI_LIB_HEADERS "include/**.hpp")
file(GLOB_RECURSE ARTI_LIB_GEN_HEADERS "${CMAKE_CURRENT_BINARY_DIR}/include/**.hpp")
add_library(
library SHARED
objs OBJECT
${ARTI_LIB_SOURCES}
)
set_target_properties(
library PROPERTIES
OUTPUT_NAME "artichoke"
objs PROPERTIES
POSITION_INDEPENDENT_CODE 1
)
target_compile_options(
library PRIVATE
objs PRIVATE
-pedantic
-Wall
-Wextra
@ -30,24 +30,58 @@ target_compile_options(
-Wno-unused
)
target_sources(
library PUBLIC
FILE_SET HEADERS
BASE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/include
FILES
${ARTI_LIB_HEADERS}
)
target_sources(
library PUBLIC
FILE_SET HEADERS
BASE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/include
FILES
${ARTI_LIB_GEN_HEADERS}
)
target_include_directories(
library PUBLIC
objs PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
$<INSTALL_INTERFACE:>
)
add_library(
library SHARED
$<TARGET_OBJECTS:objs>
)
add_library(
artichoke::library ALIAS
library
)
set_target_properties(
library PROPERTIES
OUTPUT_NAME "artichoke"
)
add_library(
library_static STATIC
$<TARGET_OBJECTS:objs>
)
add_library(
artichoke::library_static ALIAS
library_static
)
set_target_properties(
library_static PROPERTIES
OUTPUT_NAME "artichoke"
)
set(LIB_TARGETS objs library library_static)
foreach(TGET IN LISTS LIB_TARGETS)
target_sources(
${TGET} INTERFACE
FILE_SET HEADERS
BASE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/include
FILES
${ARTI_LIB_HEADERS}
)
target_sources(
${TGET} INTERFACE
FILE_SET HEADERS
BASE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/include
FILES
${ARTI_LIB_GEN_HEADERS}
)
endforeach()

View File

@ -1,12 +1,12 @@
#pragma once
#include <deque>
#include <vector>
#include <artichoke/Util/Expected.hpp>
#include <artichoke/Coroutine/Generator.hpp>
#include <artichoke/Tokenizer/Token.hpp>
#include <artichoke/Tokenizer/TokenizerRange.hpp>
namespace arti::lang {
@ -30,6 +30,8 @@ namespace arti::lang {
void swap(Tokenizer &other) noexcept;
TokenizerRange range() noexcept;
private:
Generator<Expected<Token>> tokenize();

View File

@ -0,0 +1,71 @@
#pragma once
#include <cstddef>
#include <iterator>
#include <artichoke/Util/Expected.hpp>
#include <artichoke/Tokenizer/Token.hpp>
namespace arti::lang {
  struct Tokenizer;

  /**
   * Move-only range adaptor that lets a Tokenizer be walked with a
   * range-based `for` loop.
   *
   * The constructor is private and Tokenizer is a friend, so instances are
   * obtained from the tokenizer itself. The range only stores a pointer to
   * that tokenizer; it does not own it, so the tokenizer has to stay alive
   * for as long as the range or any of its iterators is used.
   */
  struct [[nodiscard]] TokenizerRange {
    friend struct Tokenizer;

    struct Iterator;
    struct Sentinel;

    using iterator_type = Iterator;
    using sentinel_type = Sentinel;

    TokenizerRange(TokenizerRange &&) noexcept;
    TokenizerRange &operator=(TokenizerRange &&) noexcept;

    TokenizerRange(const TokenizerRange &) noexcept = delete;
    TokenizerRange &operator=(const TokenizerRange &) noexcept = delete;

    /** Returns an iterator positioned on the tokenizer's current token. */
    Iterator begin();

    /** Returns the end marker that iterators are compared against. */
    Sentinel end() const noexcept;

    /**
     * Single-pass, move-only input iterator over `Expected<Token>` values.
     *
     * The iterator keeps a cached copy of the current element (`cvalue`);
     * dereferencing hands out a reference to that cache.
     */
    struct Iterator {
      friend struct TokenizerRange;

      using iterator_category = std::input_iterator_tag;
      using difference_type = std::ptrdiff_t;

      using ValueType = Expected<Token>;
      using ReferenceType = ValueType &;
      using PointerType = ValueType *;

      using value_type = ValueType;
      using pointer_type = PointerType;

      // Standard member names looked up by std::iterator_traits; the
      // project-style aliases above are kept for existing users.
      using reference = ReferenceType;
      using pointer = PointerType;

      Iterator(Iterator &&) noexcept;
      Iterator &operator=(Iterator &&) noexcept;

      Iterator(const Iterator &) noexcept = delete;
      Iterator &operator=(const Iterator &) noexcept = delete;

      /** Advances to the next element and returns `*this`. */
      Iterator &operator++();

      /** Post-increment; returns nothing because the iterator is not copyable. */
      void operator++(int);

      /** Access to the cached current element. */
      ReferenceType operator*() const noexcept;
      PointerType operator->() const noexcept;

      // Iterator/sentinel comparisons, spelled out in both argument orders.
      friend bool operator==(const Iterator &, Sentinel);
      friend bool operator==(Sentinel, const Iterator &);

      friend bool operator!=(const Iterator &, Sentinel);
      friend bool operator!=(Sentinel, const Iterator &);

    private:
      Iterator(Tokenizer *tokenizer) noexcept;

      Tokenizer *tokenizer;
      // mutable so the const dereference operators can return a non-const
      // reference/pointer to the cached element.
      mutable Expected<Token> cvalue;
    };

    /** Empty tag type marking the end of the range. */
    struct Sentinel {};

  private:
    TokenizerRange(Tokenizer *tokenizer);

    Tokenizer *tokenizer;
  };
}

View File

@ -1,8 +1,8 @@
#include <artichoke/Tokenizer/Tokenizer.hpp>
#include <print>
#include <utility>
#include <artichoke/Tokenizer/TokenizerRange.hpp>
#include <artichoke/Util/Strings.hpp>
#include <artichoke/Util/Demangle.hpp>
#include <artichoke/Util/TrieMap.hpp>
@ -36,8 +36,22 @@ namespace arti::lang {
return *this;
}
// Builds a range adaptor bound to this tokenizer so callers can iterate
// over its tokens with a range-based for loop.
TokenizerRange Tokenizer::range() noexcept {
    TokenizerRange tokens{ this };
    return tokens;
}
Expected<void> Tokenizer::consume(std::size_t n) noexcept {
while (not tokensBuffer.empty()) {
while (n > 0) {
if (tokensBuffer.empty()) {
if (auto ok = peek(); ! ok) {
return Unexpected<>{ ok.error() };
}
if (finished()) {
break;
}
}
tokensBuffer.pop_front();
n -= 1;
}
@ -94,7 +108,11 @@ namespace arti::lang {
bool Tokenizer::finished() const noexcept {
if (tokensGenerator.finished()) {
return tokensBuffer.empty();
if (!tokensBuffer.empty()) {
return tokensBuffer.front().value == TokenV::tkEOF;
}
return true;
}
return false;
@ -340,6 +358,7 @@ namespace arti::lang {
}
if (*iter == '.') {
auto dotIter = iter;
forward();
while (iter != source.end()) {
@ -350,6 +369,21 @@ namespace arti::lang {
forward();
}
if ((iter - dotIter) == 1) {
    // Nothing was consumed after the '.', so `iter` currently sits on the
    // offending character, or on the end of the source. Record what was
    // found *before* rewinding: once `iter` is moved back onto the dot it
    // can no longer equal `source.end()`, and dereferencing `iter + 1`
    // would read the end iterator for inputs that stop right after the dot.
    const std::string found = iter == source.end()
        ? std::string{ "EOF" }
        : std::string{ *iter };

    /* Revert to dot */
    --iter;
    --column;

    return langException<ExceptCode::ecInvalidLiteral>(
        line,
        column,
        "digit",
        found
    );
}
return Token{
TokenV::tkDecimal,
cLine,
@ -390,7 +424,6 @@ namespace arti::lang {
}
if (*iter == '"') {
forward();
break;
}
@ -412,12 +445,23 @@ namespace arti::lang {
forward();
}
return Token{
TokenV::tkString,
cLine,
cColumn,
{ stIter, iter }
};
if (*iter == '"') {
forward();
return Token{
TokenV::tkString,
cLine,
cColumn,
{ stIter, iter }
};
}
return langException<ExceptCode::ecInvalidLiteral>(
line,
column,
"end of string (\")",
"EOF"
);
}
Expected<Token> Tokenizer::readCharacter() {

View File

@ -0,0 +1,84 @@
#include <artichoke/Tokenizer/TokenizerRange.hpp>
#include <utility>
#include <artichoke/Tokenizer/Tokenizer.hpp>
namespace arti::lang {
  using Iterator = TokenizerRange::Iterator;
  using Sentinel = TokenizerRange::Sentinel;

  // Binds the range to `tokenizer`; the pointer is stored, not owned.
  TokenizerRange::TokenizerRange(Tokenizer *tokenizer)
    : tokenizer(tokenizer) { }

  // Moving transfers the tokenizer pointer and leaves `other` holding nullptr.
  TokenizerRange::TokenizerRange(TokenizerRange &&other) noexcept
    : tokenizer(std::exchange(other.tokenizer, nullptr)) { }

  TokenizerRange &TokenizerRange::operator=(TokenizerRange &&other) noexcept {
    this->tokenizer = std::exchange(other.tokenizer, nullptr);
    return *this;
  }

  // Creates an iterator positioned on whatever the tokenizer currently peeks.
  Iterator TokenizerRange::begin() {
    return Iterator{ this->tokenizer };
  }

  Sentinel TokenizerRange::end() const noexcept {
    return Sentinel{};
  }

  // Caches the tokenizer's current token. A null tokenizer (for example when
  // begin() is called on a moved-from range) is tolerated: the cache is left
  // default-constructed and the iterator compares equal to the sentinel.
  Iterator::Iterator(Tokenizer *tokenizer) noexcept
    : tokenizer(tokenizer)
    , cvalue() {
    if (this->tokenizer != nullptr) {
      this->cvalue = this->tokenizer->peek();
    }
  }

  // Moving leaves `other` with a null tokenizer and a default-constructed cache.
  Iterator::Iterator(Iterator &&other) noexcept
    : tokenizer(std::exchange(other.tokenizer, nullptr))
    , cvalue(std::exchange(other.cvalue, {})) { }

  Iterator &Iterator::operator=(Iterator &&other) noexcept {
    this->tokenizer = std::exchange(other.tokenizer, nullptr);
    this->cvalue = std::exchange(other.cvalue, {});
    return *this;
  }

  // Advances to the next element. The current token is only consumed when
  // the cached element holds a value; after an error the tokenizer is simply
  // peeked again. The result of consume() is discarded on purpose: the
  // following peek() refreshes the cache either way.
  Iterator &Iterator::operator++() {
    if (this->tokenizer == nullptr) {
      // Moved-from iterator: nothing to advance.
      return *this;
    }

    if (this->cvalue) {
      // Explicit void cast instead of std::ignore, which lives in <tuple>
      // and is not included by this file.
      static_cast<void>(this->tokenizer->consume());
    }

    this->cvalue = this->tokenizer->peek();
    return *this;
  }

  // Post-increment has no meaningful value to return for a move-only iterator.
  void Iterator::operator++(int) {
    static_cast<void>(this->operator++());
  }

  Iterator::ReferenceType Iterator::operator*() const noexcept {
    return this->cvalue;
  }

  Iterator::PointerType Iterator::operator->() const noexcept {
    return &this->cvalue;
  }

  // An iterator is at the end when it has no tokenizer (moved-from), when the
  // tokenizer reports finished(), or when the cached element is an error or
  // the EOF token.
  bool operator==(const Iterator &it, Sentinel) {
    if (it.tokenizer == nullptr || it.tokenizer->finished()) {
      return true;
    }

    return !it.cvalue.has_value() || it.cvalue->value == TokenV::tkEOF;
  }

  bool operator==(Sentinel, const Iterator &it) {
    return it == Sentinel{};
  }

  bool operator!=(const Iterator &it, Sentinel) {
    return !(it == Sentinel{});
  }

  bool operator!=(Sentinel, const Iterator &it) {
    return !(it == Sentinel{});
  }
} // namespace arti::lang

20
tests/CMakeLists.txt Normal file
View File

@ -0,0 +1,20 @@
# Test-suite entry point; only processed when the top-level project is
# configured with ENABLE_TESTING=ON.

# Test-only dependencies (presumably where Catch2 is located or fetched;
# TODO confirm against cmake/dependencies.cmake).
include(cmake/dependencies.cmake)
# Testing is enabled from this directory rather than the project root, which
# is why the README points ctest at build/tests.
enable_testing()
# Interface target that carries the settings shared by every test
# executable: the common test include directory, the library under test and
# Catch2 with its provided main().
# NOTE(review): `test` is a target name CMake may reserve once testing is
# enabled (policy CMP0037); consider a project-prefixed name if
# enable_testing() ever moves to the top-level CMakeLists.txt.
add_library(test INTERFACE)
target_include_directories(
test INTERFACE
${CMAKE_CURRENT_SOURCE_DIR}/include
)
target_link_libraries(
test INTERFACE
artichoke::library
Catch2::Catch2WithMain
)
# Catch2's CMake helper module; it supplies catch_discover_tests(), which the
# per-component test directories call.
include(Catch)
add_subdirectory(Tokenizer)

View File

@ -0,0 +1,16 @@
# Collect every tokenizer test source below src/.
# CONFIGURE_DEPENDS makes the build re-check the glob, so test files that are
# added or removed are picked up without a manual re-configure.
file(
  GLOB_RECURSE TOKENIZER_TEST_SRC
  CONFIGURE_DEPENDS
  "src/**.cpp"
)

# One executable holding all tokenizer test cases.
add_executable(
  test-tokenizer
  ${TOKENIZER_TEST_SRC}
)

# `test` is the shared interface target from the parent directory; it brings
# in the test include directory, the library under test and Catch2.
target_link_libraries(
  test-tokenizer PRIVATE
  test
)

# Register each Catch2 test case with CTest under the "Tokenizer." prefix.
catch_discover_tests(
  test-tokenizer
  TEST_PREFIX "Tokenizer."
)

View File

@ -0,0 +1,99 @@
#include <catch2/catch_test_macros.hpp>
#include <string>
#include <vector>
#include <artichoke/Tokenizer/Tokenizer.hpp>
namespace lang = arti::lang;
TEST_CASE("API_PeekOffset", "[api][peek]") {
const std::string source = "a b c";
lang::Tokenizer tkz{ source };
auto t0 = tkz.peek(0);
REQUIRE(t0.has_value());
REQUIRE(t0->value == lang::TokenV::tkIdentifier);
REQUIRE(t0->strValue == "a");
auto t1 = tkz.peek(1);
REQUIRE(t1.has_value());
REQUIRE(t1->value == lang::TokenV::tkIdentifier);
REQUIRE(t1->strValue == "b");
auto t2 = tkz.peek(2);
REQUIRE(t2.has_value());
REQUIRE(t2->value == lang::TokenV::tkIdentifier);
REQUIRE(t2->strValue == "c");
auto t3 = tkz.peek(3);
REQUIRE(t3.has_value());
REQUIRE(t3->value == lang::TokenV::tkEOF);
// Ensure nothing was consumed by peeks
REQUIRE_FALSE(tkz.finished());
auto t0_again = tkz.peek();
REQUIRE(t0_again.has_value());
REQUIRE(t0_again->value == lang::TokenV::tkIdentifier);
REQUIRE(t0_again->strValue == "a");
}
TEST_CASE("API_ConsumeAndFinishedSemantics", "[api][consume][finished]") {
const std::string source = "x y z";
lang::Tokenizer tkz{ source };
// consume 2 tokens: x, y
REQUIRE(tkz.consume(2).has_value());
REQUIRE_FALSE(tkz.finished());
auto t = tkz.peek();
REQUIRE(t.has_value());
REQUIRE(t->value == lang::TokenV::tkIdentifier);
REQUIRE(t->strValue == "z");
// consume last token
REQUIRE(tkz.consume().has_value());
// finished() returns true only after EOF token has been produced
REQUIRE_FALSE(tkz.finished());
auto eof = tkz.peek();
REQUIRE(eof.has_value());
REQUIRE(eof->value == lang::TokenV::tkEOF);
REQUIRE(tkz.finished());
}
TEST_CASE("API_RangeIterationStopsAtEOF", "[api][range]") {
const std::string source =
"let ident := 42 /* skip this */ + 1";
lang::Tokenizer tkz{ source };
std::vector<lang::TokenV> kinds;
std::vector<std::string_view> lex;
for (auto token : tkz.range()) {
REQUIRE(token.has_value());
kinds.push_back(token->value);
lex.push_back(token->strValue);
}
// Expected sequence: kwLet, tkIdentifier("ident"), opLabel, tkInteger("42"), opPlus, tkInteger("1")
REQUIRE(kinds.size() == 6);
REQUIRE(kinds[0] == lang::TokenV::kwLet);
REQUIRE(kinds[1] == lang::TokenV::tkIdentifier);
REQUIRE(kinds[2] == lang::TokenV::opLabel);
REQUIRE(kinds[3] == lang::TokenV::tkInteger);
REQUIRE(kinds[4] == lang::TokenV::opPlus);
REQUIRE(kinds[5] == lang::TokenV::tkInteger);
REQUIRE(lex[0] == "let");
REQUIRE(lex[1] == "ident");
REQUIRE(lex[2] == ":=");
REQUIRE(lex[3] == "42");
REQUIRE(lex[4] == "+");
REQUIRE(lex[5] == "1");
// After iterating the range, peek should yield EOF
auto eof = tkz.peek();
REQUIRE(eof.has_value());
REQUIRE(eof->value == lang::TokenV::tkEOF);
}

View File

@ -0,0 +1,87 @@
#include <catch2/catch_test_macros.hpp>
#include <string>
#include <artichoke/Tokenizer/Tokenizer.hpp>
#include <artichoke/Util/Expected.hpp>
namespace lang = arti::lang;
TEST_CASE("Comments_BlockSkipped", "[comments][block][skip]") {
// Ensure that block comments are ignored and do not emit tokens.
const std::string source = "foo /* a block comment with symbols 123 !@# */ bar";
lang::Tokenizer tkz{source};
auto t1 = tkz.peek();
REQUIRE(t1.has_value());
REQUIRE(t1->value == lang::TokenV::tkIdentifier);
REQUIRE(t1->strValue == "foo");
REQUIRE(tkz.consume().has_value());
auto t2 = tkz.peek();
REQUIRE(t2.has_value());
REQUIRE(t2->value == lang::TokenV::tkIdentifier);
REQUIRE(t2->strValue == "bar");
REQUIRE(tkz.consume().has_value());
auto eof = tkz.peek();
REQUIRE(eof.has_value());
REQUIRE(eof->value == lang::TokenV::tkEOF);
}
TEST_CASE("Comments_BlockMultiline", "[comments][block][multiline]") {
const std::string source =
"alpha /* line1\n"
"line2\n"
"line3 */ beta";
lang::Tokenizer tkz{source};
auto t1 = tkz.peek();
REQUIRE(t1.has_value());
REQUIRE(t1->value == lang::TokenV::tkIdentifier);
REQUIRE(t1->strValue == "alpha");
REQUIRE(tkz.consume().has_value());
auto t2 = tkz.peek();
REQUIRE(t2.has_value());
REQUIRE(t2->value == lang::TokenV::tkIdentifier);
REQUIRE(t2->strValue == "beta");
REQUIRE(tkz.consume().has_value());
auto eof = tkz.peek();
REQUIRE(eof.has_value());
REQUIRE(eof->value == lang::TokenV::tkEOF);
}
TEST_CASE("Comments_UnterminatedBlock_Error", "[comments][block][error]") {
// Unterminated block comments should surface an error from the tokenizer.
const std::string source = "foo /* this never ends...";
lang::Tokenizer tkz{source};
auto t1 = tkz.peek();
REQUIRE(t1.has_value());
REQUIRE(t1->value == lang::TokenV::tkIdentifier);
REQUIRE(t1->strValue == "foo");
REQUIRE(tkz.consume().has_value());
auto errTok = tkz.peek();
REQUIRE_FALSE(errTok.has_value());
// Check error message mentions invalid comment.
const auto &err = errTok.error();
REQUIRE(err.message.find("Invalid comment") != std::string::npos);
}
TEST_CASE("Comments_SingleLineUnsupported_Skip", "[comments][.line]") {
// The tokenizer currently does NOT support '//' comments.
// Keep this test as a placeholder and mark it skipped to avoid failures.
SKIP("Single-line '//' comments are not supported yet by the tokenizer");
const std::string source = "foo // comment\n bar";
lang::Tokenizer tkz{source};
(void)tkz; // silence unused
}

View File

@ -0,0 +1,127 @@
#include <catch2/catch_test_macros.hpp>
#include <array>
#include <string>
#include <vector>
#include <artichoke/Tokenizer/Tokenizer.hpp>
#include <Utils.hpp>
namespace lang = arti::lang;
template <std::size_t N>
static void CommonIdentifiersSuccess(
const std::array<std::string_view, N> &ids
) {
const std::string source = SourceFromTokens(ids);
std::size_t it = 0;
lang::Tokenizer tkz{ source };
for (auto token : tkz.range()) {
REQUIRE(token.has_value());
REQUIRE(token->value == lang::TokenV::tkIdentifier);
REQUIRE(token->strValue == ids.at(it++));
}
REQUIRE(it == ids.size());
REQUIRE(tkz.peek().has_value());
REQUIRE(tkz.peek()->value == lang::TokenV::tkEOF);
}
TEST_CASE("Identifiers_Basic", "[identifiers][valid]") {
constexpr std::array<std::string_view, 8> ids = {
"a", "abc", "a_b", "snake_case", "camelCase", "PascalCase", "_id", "with123"
};
CommonIdentifiersSuccess(ids);
}
TEST_CASE("Identifiers_DigitsAfterFirst", "[identifiers][valid]") {
constexpr std::array<std::string_view, 6> ids = {
"a1", "abc123", "_a1_b2", "v2", "x9y8z7", "i18n"
};
CommonIdentifiersSuccess(ids);
}
TEST_CASE("Identifiers_Long", "[identifiers][valid][long]") {
// Create a long identifier to ensure tokenizer handles large spans.
std::string longId(512, 'a');
std::vector<std::string_view> toks{ longId };
const std::string source = SourceFromTokens(toks);
lang::Tokenizer tkz{ source };
auto t = tkz.peek();
REQUIRE(t.has_value());
REQUIRE(t->value == lang::TokenV::tkIdentifier);
REQUIRE(t->strValue == longId);
REQUIRE(tkz.consume().has_value());
auto eof = tkz.peek();
REQUIRE(eof.has_value());
REQUIRE(eof->value == lang::TokenV::tkEOF);
}
TEST_CASE("Identifiers_WithOperators", "[identifiers][operators]") {
// '$' and '?' are operators, not identifier characters.
// '$foo' -> opMut, tkIdentifier("foo")
// '?bar' -> opOpt, tkIdentifier("bar")
const std::string source = "$foo ?bar";
lang::Tokenizer tkz{ source };
auto t1 = tkz.peek();
REQUIRE(t1.has_value());
REQUIRE(t1->value == lang::TokenV::opMut);
REQUIRE(tkz.consume().has_value());
auto t2 = tkz.peek();
REQUIRE(t2.has_value());
REQUIRE(t2->value == lang::TokenV::tkIdentifier);
REQUIRE(t2->strValue == "foo");
REQUIRE(tkz.consume().has_value());
auto t3 = tkz.peek();
REQUIRE(t3.has_value());
REQUIRE(t3->value == lang::TokenV::opOpt);
REQUIRE(tkz.consume().has_value());
auto t4 = tkz.peek();
REQUIRE(t4.has_value());
REQUIRE(t4->value == lang::TokenV::tkIdentifier);
REQUIRE(t4->strValue == "bar");
REQUIRE(tkz.consume().has_value());
auto eof = tkz.peek();
REQUIRE(eof.has_value());
REQUIRE(eof->value == lang::TokenV::tkEOF);
}
TEST_CASE("Identifiers_DotAccess", "[identifiers][dot]") {
const std::string source = "foo.bar";
lang::Tokenizer tkz{ source };
auto t1 = tkz.peek();
REQUIRE(t1.has_value());
REQUIRE(t1->value == lang::TokenV::tkIdentifier);
REQUIRE(t1->strValue == "foo");
REQUIRE(tkz.consume().has_value());
auto t2 = tkz.peek();
REQUIRE(t2.has_value());
REQUIRE(t2->value == lang::TokenV::opDot);
REQUIRE(tkz.consume().has_value());
auto t3 = tkz.peek();
REQUIRE(t3.has_value());
REQUIRE(t3->value == lang::TokenV::tkIdentifier);
REQUIRE(t3->strValue == "bar");
REQUIRE(tkz.consume().has_value());
auto eof = tkz.peek();
REQUIRE(eof.has_value());
REQUIRE(eof->value == lang::TokenV::tkEOF);
}

View File

@ -0,0 +1,93 @@
#include <catch2/catch_test_macros.hpp>
#include <array>
#include <artichoke/Tokenizer/Tokenizer.hpp>
#include <Utils.hpp>
namespace lang = arti::lang;
template <std::size_t N>
static void CommonKeywordsSuccess(
const std::array<std::string_view, N> &lexemes,
const std::array<lang::TokenV, N> &kinds
) {
static_assert(N > 0, "Must provide at least one keyword");
const std::string source = SourceFromTokens(lexemes);
std::size_t it = 0;
lang::Tokenizer tkz{ source };
for (auto token : tkz.range()) {
REQUIRE(token.has_value());
REQUIRE(token->value == kinds.at(it));
REQUIRE(token->strValue == lexemes.at(it));
++it;
}
REQUIRE(it == lexemes.size());
REQUIRE(tkz.peek().has_value());
REQUIRE(tkz.peek()->value == lang::TokenV::tkEOF);
}
TEST_CASE("Keywords_AllRecognized", "[keywords][valid]") {
constexpr std::array<std::string_view, 31> lexemes = {
"_","or","not","and","if","else","fn","enum","struct","def","let","for",
"loop","break","continue","while","match","switch","return","unreachable",
"defer","errdefer","true","false","null","this","import","export","module","using","this"
};
constexpr std::array<lang::TokenV, 31> kinds = {
lang::TokenV::kwUnderscore,
lang::TokenV::kwOr,
lang::TokenV::kwNot,
lang::TokenV::kwAnd,
lang::TokenV::kwIf,
lang::TokenV::kwElse,
lang::TokenV::kwFn,
lang::TokenV::kwEnum,
lang::TokenV::kwStruct,
lang::TokenV::kwDef,
lang::TokenV::kwLet,
lang::TokenV::kwFor,
lang::TokenV::kwLoop,
lang::TokenV::kwBreak,
lang::TokenV::kwContinue,
lang::TokenV::kwWhile,
lang::TokenV::kwMatch,
lang::TokenV::kwSwitch,
lang::TokenV::kwReturn,
lang::TokenV::kwUnreachable,
lang::TokenV::kwDefer,
lang::TokenV::kwErrDefer,
lang::TokenV::kwTrue,
lang::TokenV::kwFalse,
lang::TokenV::kwNull,
lang::TokenV::kwThis,
lang::TokenV::kwImport,
lang::TokenV::kwExport,
lang::TokenV::kwModule,
lang::TokenV::kwUsing,
lang::TokenV::kwThis,
};
CommonKeywordsSuccess(lexemes, kinds);
}
TEST_CASE("Keywords_PrecedenceOverIdentifiers", "[keywords][precedence]") {
// Ensure that keywords are recognized as keywords, while longer names remain identifiers.
constexpr std::array<std::string_view, 6> lexemes = {
"if", "iff", "return", "returnX", "_", "_id"
};
constexpr std::array<lang::TokenV, 6> kinds = {
lang::TokenV::kwIf, // "if" is a keyword
lang::TokenV::tkIdentifier,// "iff" should be an identifier
lang::TokenV::kwReturn, // "return" is a keyword
lang::TokenV::tkIdentifier,// "returnX" is not a keyword
lang::TokenV::kwUnderscore,// "_" is a keyword in this language
lang::TokenV::tkIdentifier // "_id" is a regular identifier
};
CommonKeywordsSuccess(lexemes, kinds);
}

View File

@ -0,0 +1,171 @@
#include <catch2/catch_test_macros.hpp>
#include <array>
#include <artichoke/Tokenizer/Tokenizer.hpp>
#include <Utils.hpp>
namespace lang = arti::lang;
template <std::size_t N>
static void CommonIntegersSuccess(
lang::TokenV type,
const std::array<std::string_view, N> &expected
) {
const std::string source = SourceFromTokens(expected);
std::size_t it = 0;
lang::Tokenizer tkz{ source };
for (auto token : tkz.range()) {
REQUIRE(token.has_value());
REQUIRE(token->value == type);
REQUIRE(token->strValue == expected.at(it++));
}
REQUIRE(it == expected.size());
REQUIRE(tkz.peek().has_value());
REQUIRE(tkz.peek()->value == lang::TokenV::tkEOF);
}
TEST_CASE("Numbers_Integers", "[integers][valid]") {
constexpr std::array<std::string_view, 6> expected = {
"0", "1", "42", "123456", "98712390", "12381723912465471"
};
CommonIntegersSuccess(lang::TokenV::tkInteger, expected);
}
TEST_CASE("Numbers_HexIntegers", "[integers][valid]") {
constexpr std::array<std::string_view, 6> expected = {
"0x831", "0xAFEFE", "0xABEBE",
"0x7a147e8a3", "0x98712390", "0x1d238c18e7ff239a12465471"
};
CommonIntegersSuccess(lang::TokenV::tkInteger, expected);
}
TEST_CASE("Numbers_OctIntegers", "[integers][valid]") {
constexpr std::array<std::string_view, 6> expected = {
"041", "064123", "0136237", "012345", "01", "071236571236512631723651"
};
CommonIntegersSuccess(lang::TokenV::tkInteger, expected);
}
TEST_CASE("Numbers_BinIntegers", "[integers][valid]") {
constexpr std::array<std::string_view, 6> expected = {
"0b0101101", "0b1", "0b01", "0b0", "0b011010101110101101110101011",
"0b11110101011010101"
};
CommonIntegersSuccess(lang::TokenV::tkInteger, expected);
}
TEST_CASE("Numbers_Decimal", "[decimals][valid]") {
constexpr std::array<std::string_view, 6> expected = {
"1.0", "0.5", "3.14159", "10.50", "9999.0001", "1375123476175981.813751235"
};
CommonIntegersSuccess(lang::TokenV::tkDecimal, expected);
}
TEST_CASE("Numbers_UnaryMinusSeparate", "[numbers][unary-minus]") {
const std::string source = "-1 -2.5";
lang::Tokenizer tkz{ source };
auto t1 = tkz.peek();
REQUIRE(t1.has_value());
REQUIRE(t1->value == lang::TokenV::opHyphen);
REQUIRE(tkz.consume().has_value());
auto t2 = tkz.peek();
REQUIRE(t2.has_value());
REQUIRE(t2->value == lang::TokenV::tkInteger);
REQUIRE(t2->strValue == "1");
REQUIRE(tkz.consume().has_value());
auto t3 = tkz.peek();
REQUIRE(t3.has_value());
REQUIRE(t3->value == lang::TokenV::opHyphen);
REQUIRE(tkz.consume().has_value());
auto t4 = tkz.peek();
REQUIRE(t4.has_value());
REQUIRE(t4->value == lang::TokenV::tkDecimal);
REQUIRE(t4->strValue == "2.5");
REQUIRE(tkz.consume().has_value());
auto eof = tkz.peek();
REQUIRE(eof.has_value());
REQUIRE(eof->value == lang::TokenV::tkEOF);
}
TEST_CASE("Numbers_DotBoundaries_Disambiguation", "[numbers][dot][edge]") {
// Expect numbers must start with a digit:
// .5 -> '.' + '5'
// 10. -> ERROR (expects digit after '.'), then '.' token if continued
// 1..2 -> ERROR (expects digit after '.'), then '.' '.' '2' if continued
const std::string source = ".5 10. 1..2";
lang::Tokenizer tkz{ source };
// .5 -> '.' then '5'
auto a1 = tkz.peek();
REQUIRE(a1.has_value());
REQUIRE(a1->value == lang::TokenV::opDot);
REQUIRE(tkz.consume().has_value());
auto a2 = tkz.peek();
REQUIRE(a2.has_value());
REQUIRE(a2->value == lang::TokenV::tkInteger);
REQUIRE(a2->strValue == "5");
REQUIRE(tkz.consume().has_value());
// 10. -> first an error (expects a digit after '.'), then '.' is parsed if we continue
auto err1 = tkz.peek();
REQUIRE_FALSE(err1.has_value());
auto after_err1 = tkz.peek();
REQUIRE(after_err1.has_value());
REQUIRE(after_err1->value == lang::TokenV::opDot);
REQUIRE(tkz.consume().has_value());
// 1..2 -> first an error (expects a digit after '.'), then '.' '.' '2'
auto err2 = tkz.peek();
REQUIRE_FALSE(err2.has_value());
auto dot1 = tkz.peek();
REQUIRE(dot1.has_value());
REQUIRE(dot1->value == lang::TokenV::opDot);
REQUIRE(tkz.consume().has_value());
auto dot2 = tkz.peek();
REQUIRE(dot2.has_value());
REQUIRE(dot2->value == lang::TokenV::opDot);
REQUIRE(tkz.consume().has_value());
auto last = tkz.peek();
REQUIRE(last.has_value());
REQUIRE(last->value == lang::TokenV::tkInteger);
REQUIRE(last->strValue == "2");
REQUIRE(tkz.consume().has_value());
auto eof = tkz.peek();
REQUIRE(eof.has_value());
REQUIRE(eof->value == lang::TokenV::tkEOF);
}
// Malformed numeric prefixes / digits must be reported as an invalid literal
// on the very first peek.
TEST_CASE("Numbers_InvalidPrefixes", "[numbers][invalid]") {
    const std::array<const char*, 5> invalids = { "0x", "0b", "0xG", "0b2", "08" };

    for (auto src : invalids) {
        // Keep the text in a named string that outlives the tokenizer, like
        // every other test in this suite does. Passing a temporary
        // std::string would leave the tokenizer reading freed memory if it
        // only keeps a view of its input (NOTE(review): confirm against the
        // Tokenizer constructor).
        const std::string source{ src };
        lang::Tokenizer tkz{ source };

        auto tok = tkz.peek();
        REQUIRE_FALSE(tok.has_value());

        const auto &err = tok.error();
        REQUIRE(
            err.message.find("Invalid literal") != std::string::npos
        );
    }
}

View File

@ -0,0 +1,178 @@
#include <catch2/catch_test_macros.hpp>
#include <array>
#include <string>
#include <vector>
#include <artichoke/Tokenizer/Tokenizer.hpp>
#include <Utils.hpp>
namespace lang = arti::lang;
template <std::size_t N>
static void CommonOpsSuccess(
const std::array<std::string_view, N> &lexemes,
const std::array<lang::TokenV, N> &kinds
) {
static_assert(N > 0, "Must provide at least one operator");
const std::string source = SourceFromTokens(lexemes);
std::size_t it = 0;
lang::Tokenizer tkz{ source };
for (auto token : tkz.range()) {
REQUIRE(token.has_value());
REQUIRE(token->value == kinds.at(it));
REQUIRE(token->strValue == lexemes.at(it));
++it;
}
REQUIRE(it == lexemes.size());
REQUIRE(tkz.peek().has_value());
REQUIRE(tkz.peek()->value == lang::TokenV::tkEOF);
}
TEST_CASE("Operators_SingleChar", "[operators][single]") {
constexpr std::array<std::string_view, 25> lex = {
".", "%", "+", "-", "/", "!", "*", ":", ",", "=", ";", "^", "~",
"&", "|", "<", ">", "(", ")", "[", "]", "{", "}", "?", "$"
};
constexpr std::array<lang::TokenV, 25> kinds = {
lang::TokenV::opDot, lang::TokenV::opMod, lang::TokenV::opPlus, lang::TokenV::opHyphen,
lang::TokenV::opSlash, lang::TokenV::opBang, lang::TokenV::opStar, lang::TokenV::opColon,
lang::TokenV::opComma, lang::TokenV::opAssign, lang::TokenV::opSemicolon, lang::TokenV::opCaret,
lang::TokenV::opTilde, lang::TokenV::opAnd, lang::TokenV::opOr, lang::TokenV::opLt,
lang::TokenV::opGt, lang::TokenV::opLParen, lang::TokenV::opRParen, lang::TokenV::opLBracket,
lang::TokenV::opRBracket, lang::TokenV::opLSquirly, lang::TokenV::opRSquirly, lang::TokenV::opOpt,
lang::TokenV::opMut
};
CommonOpsSuccess(lex, kinds);
}
TEST_CASE("Operators_MultiChar", "[operators][multi]") {
constexpr std::array<std::string_view, 22> lex = {
"==","!=", "<=", ">=", "<<", ">>", "&&", "||",
"+=", "-=", "*=", "/=", "%=", "&=", "|=",
"<<=", ">>=", "&&=", "||=", "->", "::", ":="
};
constexpr std::array<lang::TokenV, 22> kinds = {
lang::TokenV::opEq, lang::TokenV::opNeq, lang::TokenV::opLtEq, lang::TokenV::opGtEq,
lang::TokenV::opLShift, lang::TokenV::opRShift, lang::TokenV::opBoolAnd, lang::TokenV::opBoolOr,
lang::TokenV::opPlusAssign, lang::TokenV::opHyphenAssign, lang::TokenV::opStarAssign, lang::TokenV::opSlashAssign,
lang::TokenV::opModAssign, lang::TokenV::opAndAssign, lang::TokenV::opOrAssign,
lang::TokenV::opLShiftAssign, lang::TokenV::opRShiftAssign, lang::TokenV::opBoolAndAssign, lang::TokenV::opBoolORAssign,
lang::TokenV::opArrow, lang::TokenV::opAccess, lang::TokenV::opLabel
};
CommonOpsSuccess(lex, kinds);
}
TEST_CASE("Operators_DotPrefixedSpecials", "[operators][dot][special]") {
constexpr std::array<std::string_view, 4> lex = {
".#", ".[", ".*", ".@"
};
constexpr std::array<lang::TokenV, 4> kinds = {
lang::TokenV::opSliceSize, lang::TokenV::opPtrSlice, lang::TokenV::opSlicePtr, lang::TokenV::opReflect
};
CommonOpsSuccess(lex, kinds);
}
TEST_CASE("Operators_GreedyLongestMatch", "[operators][greedy]") {
// Ensure longest valid operator is selected.
constexpr std::array<std::string_view, 8> lex = {
">>=", "<<=", "&&=", "||=",
">=", "<=", "->", "::"
};
constexpr std::array<lang::TokenV, 8> kinds = {
lang::TokenV::opRShiftAssign, lang::TokenV::opLShiftAssign, lang::TokenV::opBoolAndAssign, lang::TokenV::opBoolORAssign,
lang::TokenV::opGtEq, lang::TokenV::opLtEq, lang::TokenV::opArrow, lang::TokenV::opAccess
};
CommonOpsSuccess(lex, kinds);
}
TEST_CASE("Operators_BoundariesWhitespace", "[operators][boundaries]") {
// '= =' should not be '=='
const std::string source = "=\n=";
lang::Tokenizer tkz{ source };
auto t1 = tkz.peek();
REQUIRE(t1.has_value());
REQUIRE(t1->value == lang::TokenV::opAssign);
REQUIRE(tkz.consume().has_value());
auto t2 = tkz.peek();
REQUIRE(t2.has_value());
REQUIRE(t2->value == lang::TokenV::opAssign);
REQUIRE(tkz.consume().has_value());
auto eof = tkz.peek();
REQUIRE(eof.has_value());
REQUIRE(eof->value == lang::TokenV::tkEOF);
}
TEST_CASE("Operators_ContextAccessLabelArrow", "[operators][context]") {
    // '::', ':=' and '->' must be recognised between identifiers, both when
    // glued to them and when surrounded by spaces.
    struct Step {
        lang::TokenV kind;
        std::string_view text;
    };
    // Expected token stream, in source order.
    const Step steps[] = {
        { lang::TokenV::tkIdentifier, "ns" },
        { lang::TokenV::opAccess, "::" },
        { lang::TokenV::tkIdentifier, "name" },
        { lang::TokenV::opLabel, ":=" },
        { lang::TokenV::tkIdentifier, "src" },
        { lang::TokenV::opArrow, "->" },
        { lang::TokenV::tkIdentifier, "field" },
    };

    const std::string input = "ns::name := src->field";
    lang::Tokenizer lexer{ input };

    for (const Step &step : steps) {
        auto tok = lexer.peek();
        REQUIRE(tok.has_value());
        REQUIRE(tok->value == step.kind);
        REQUIRE(tok->strValue == step.text);
        REQUIRE(lexer.consume().has_value());
    }

    // After the last identifier only end-of-input remains.
    auto last = lexer.peek();
    REQUIRE(last.has_value());
    REQUIRE(last->value == lang::TokenV::tkEOF);
}
TEST_CASE("Operators_InvalidStandalone_Error", "[operators][error]") {
    // A lone '#', '@' or '`' is not a token ('#' and '@' are only valid as
    // part of '.#' and '.@'), so peeking must yield an error whose message
    // mentions "Invalid".
    const std::array<std::string, 3> inputs = { "#", "@", "`" };

    for (const std::string &input : inputs) {
        lang::Tokenizer lexer{ input };

        auto outcome = lexer.peek();
        REQUIRE_FALSE(outcome.has_value());
        REQUIRE(outcome.error().message.find("Invalid") != std::string::npos);
    }
}

View File

@ -0,0 +1,168 @@
#include <catch2/catch_test_macros.hpp>
#include <array>
#include <string>
#include <artichoke/Tokenizer/Tokenizer.hpp>
#include <artichoke/Util/Expected.hpp>
#include <Utils.hpp>
namespace lang = arti::lang;
/**
 * Shared success check for literal tests.
 *
 * Builds a source text from `lexemes` via SourceFromTokens, tokenizes it and
 * requires that the tokenizer yields exactly one token per lexeme, in order,
 * each of kind `kind` and with `strValue` equal to the original lexeme text,
 * followed by tkEOF.
 *
 * @param kind     Token kind every lexeme is expected to produce.
 * @param lexemes  Literal spellings to tokenize.
 */
template <std::size_t N>
static void CommonLiteralsSuccess(
    lang::TokenV kind,
    const std::array<std::string_view, N> &lexemes
) {
    const std::string source = SourceFromTokens(lexemes);
    lang::Tokenizer tkz{ source };

    std::size_t seen = 0;
    for (auto token : tkz.range()) {
        // Guard the index first: if the tokenizer produces more tokens than
        // expected, fail with a readable assertion instead of letting an
        // out-of-range access throw from inside the comparison below.
        REQUIRE(seen < lexemes.size());
        REQUIRE(token.has_value());
        REQUIRE(token->value == kind);
        REQUIRE(token->strValue == lexemes[seen]);
        ++seen;
    }

    // Every lexeme must have been matched, and only EOF may remain.
    REQUIRE(seen == lexemes.size());
    REQUIRE(tkz.peek().has_value());
    REQUIRE(tkz.peek()->value == lang::TokenV::tkEOF);
}
TEST_CASE("Strings_Simple", "[strings][valid]") {
    // Escape-free string literals: single character, word, embedded spaces,
    // digits and punctuation. Each must come back as one tkString token.
    constexpr auto samples = std::to_array<std::string_view>({
        R"("a")",
        R"("hello")",
        R"("with spaces")",
        R"("12345")",
        R"Q("!@#$%^&*()")Q"
    });

    CommonLiteralsSuccess(lang::TokenV::tkString, samples);
}
TEST_CASE("Strings_Escapes", "[strings][valid][escapes]") {
    // String literals containing escape sequences. The raw backslash
    // sequences are expected to stay in the token text unchanged.
    constexpr auto samples = std::to_array<std::string_view>({
        R"("quote: \"")",
        R"("backslash: \\")",
        R"("newline: \n")",
        R"("tab: \t")",
        R"("mix: \"\\\n\t")"
    });

    CommonLiteralsSuccess(lang::TokenV::tkString, samples);
}
TEST_CASE("Strings_OperatorsAdjacency", "[strings][operators]") {
    // With no whitespace at all, "foo"+"bar" must still split into
    // tkString, opPlus, tkString.
    const std::string input = R"("foo"+"bar")";
    lang::Tokenizer lexer{ input };

    // Peek the next token, check its kind (and its text when one is given),
    // then consume it.
    const auto expect = [&lexer](lang::TokenV kind, const char *text = nullptr) {
        auto tok = lexer.peek();
        REQUIRE(tok.has_value());
        REQUIRE(tok->value == kind);
        if (text != nullptr) {
            REQUIRE(tok->strValue == text);
        }
        REQUIRE(lexer.consume().has_value());
    };

    expect(lang::TokenV::tkString, R"("foo")");
    expect(lang::TokenV::opPlus);
    expect(lang::TokenV::tkString, R"("bar")");

    auto last = lexer.peek();
    REQUIRE(last.has_value());
    REQUIRE(last->value == lang::TokenV::tkEOF);
}
TEST_CASE("Strings_Unterminated_Error", "[strings][error]") {
    // A string that is opened but never closed must be reported as an
    // error instead of producing a token.
    const std::string input = "\"unterminated";
    lang::Tokenizer lexer{ input };

    auto result = lexer.peek();
    REQUIRE_FALSE(result.has_value());
    REQUIRE(result.error().message.find("Invalid literal") != std::string::npos);
}
TEST_CASE("Chars_Simple", "[chars][valid]") {
    // Single, unescaped characters: lower case, upper case, digit,
    // underscore and a symbol. Each must come back as one tkCharacter.
    constexpr auto samples = std::to_array<std::string_view>({
        R"('a')",
        R"('Z')",
        R"('0')",
        R"('_')",
        R"('$')"
    });

    CommonLiteralsSuccess(lang::TokenV::tkCharacter, samples);
}
TEST_CASE("Chars_Escapes", "[chars][valid][escapes]") {
    // Character literals whose payload is an escape sequence: newline, tab,
    // backslash and single quote.
    constexpr auto samples = std::to_array<std::string_view>({
        R"('\n')",
        R"('\t')",
        R"('\\')",
        R"('\'')"
    });

    CommonLiteralsSuccess(lang::TokenV::tkCharacter, samples);
}
TEST_CASE("Chars_Invalid_Empty", "[chars][error]") {
    // A character literal with nothing between the quotes is rejected.
    const std::string input = "''";
    lang::Tokenizer lexer{ input };

    auto result = lexer.peek();
    REQUIRE_FALSE(result.has_value());
    REQUIRE(result.error().message.find("Invalid literal") != std::string::npos);
}
TEST_CASE("Chars_Invalid_Multiple", "[chars][error]") {
    // A character literal may hold only one character; two are rejected.
    const std::string input = "'ab'";
    lang::Tokenizer lexer{ input };

    auto result = lexer.peek();
    REQUIRE_FALSE(result.has_value());
    REQUIRE(result.error().message.find("Invalid literal") != std::string::npos);
}
TEST_CASE("Chars_Unterminated", "[chars][error]") {
    // A character literal missing its closing quote is rejected.
    const std::string input = "'a";
    lang::Tokenizer lexer{ input };

    auto result = lexer.peek();
    REQUIRE_FALSE(result.has_value());
    REQUIRE(result.error().message.find("Invalid literal") != std::string::npos);
}
TEST_CASE("Chars_InvalidEscape", "[chars][error][.escapes]") {
    // The input is the three-character escape '\x' wrapped in quotes, which
    // is expected to be rejected as an invalid literal.
    // NOTE(review): a Catch2 tag starting with '.' hides the test case from
    // the default run, so this case only executes when selected explicitly.
    // Presumably intentional; confirm.
    const std::string input = "'\\x'";
    lang::Tokenizer lexer{ input };

    auto result = lexer.peek();
    REQUIRE_FALSE(result.has_value());
    REQUIRE(result.error().message.find("Invalid literal") != std::string::npos);
}

View File

@ -0,0 +1,18 @@
# Test-only dependency bootstrap: downloads CPM.cmake and uses it to obtain
# Catch2, then makes Catch2's helper CMake scripts available to include().
include(FetchContent)

# Get CPM
# The release is pinned by version and checksum; EXPECTED_HASH makes the
# configure step fail if the downloaded file does not match.
file(
  DOWNLOAD
  https://github.com/cpm-cmake/CPM.cmake/releases/download/v0.40.8/CPM.cmake
  "${CMAKE_CURRENT_BINARY_DIR}/cmake/CPM.cmake"
  EXPECTED_HASH
  SHA256=78ba32abdf798bc616bab7c73aac32a17bbd7b06ad9e26a6add69de8f3ae4791
)
include("${CMAKE_CURRENT_BINARY_DIR}/cmake/CPM.cmake")

# Get dependencies
CPMAddPackage("gh:catchorg/Catch2@3.6.0")

# Include Catch2 CMake scripts
# Catch2_SOURCE_DIR is only set when CPM provides a source checkout. Skip the
# append otherwise so a bogus "/extras" entry never lands in CMAKE_MODULE_PATH.
if(Catch2_SOURCE_DIR)
  list(APPEND CMAKE_MODULE_PATH "${Catch2_SOURCE_DIR}/extras")
endif()

58
tests/include/Utils.hpp Normal file
View File

@ -0,0 +1,58 @@
#pragma once
#include <random>
#include <artichoke/Coroutine/Generator.hpp>
/**
 * Alternates the elements of two ranges: r1[0], r2[0], r1[1], r2[1], ...
 *
 * Elements are always produced in pairs; generation ends as soon as either
 * range is exhausted, so the last element produced comes from `r2`.
 *
 * Both ranges must have the same value type, and that type must be
 * convertible to std::string_view.
 *
 * NOTE(review): `yield` is not a C++ keyword; presumably it is a macro for
 * `co_yield` provided by Generator.hpp. Confirm.
 * NOTE(review): `r1` and `r2` are taken by reference, so they presumably have
 * to outlive the iteration of the returned generator. Confirm against
 * Generator's suspension behaviour.
 */
template <std::ranges::range R1, std::ranges::range R2>
requires(
    std::is_convertible_v<std::ranges::range_value_t<R1>, std::string_view> and
    std::is_same_v<std::ranges::range_value_t<R1>, std::ranges::range_value_t<R2>>
)
arti::lang::Generator<std::ranges::range_value_t<R1>>
InterleaveRanges(R1 &&r1, R2 &&r2) {
    auto lhs = std::ranges::begin(r1);
    auto lhsEnd = std::ranges::end(r1);
    auto rhs = std::ranges::begin(r2);
    auto rhsEnd = std::ranges::end(r2);

    // Both ranges must still have an element before a pair is emitted.
    while (lhs != lhsEnd and rhs != rhsEnd) {
        yield *lhs;
        ++lhs;

        yield *rhs;
        ++rhs;
    }
}
/**
 * Endless generator of random, non-empty whitespace runs.
 *
 * Each produced value is between 1 and `maxLen` characters long and consists
 * only of ' ', '\t' and '\n'. The engine is seeded from std::random_device,
 * so the sequence differs from run to run.
 *
 * @param maxLen  Upper bound on the length of a run. A value of 0 is treated
 *                as 1, because the length distribution requires min <= max.
 *
 * The yielded value refers to an internal buffer that is rewritten for the
 * next run, so it has to be consumed (or copied) before the generator is
 * advanced again.
 *
 * NOTE(review): `yield` is not a C++ keyword; presumably it is a macro for
 * `co_yield` provided by Generator.hpp. Confirm.
 */
static arti::lang::Generator<std::string_view>
WhitespaceGenerator(uint32_t maxLen = 5) {
    constexpr std::array<char, 3> spaceChars{ ' ', '\t', '\n' };

    // std::uniform_int_distribution(a, b) has the precondition a <= b;
    // clamp so that maxLen == 0 cannot trigger undefined behaviour.
    const uint32_t longest = maxLen == 0 ? 1 : maxLen;

    std::random_device device;
    std::mt19937 engine(device());
    std::uniform_int_distribution<uint32_t> dist(1, longest);
    // Index range is derived from the table so the two cannot drift apart.
    std::uniform_int_distribution<uint32_t> distChars(
        0, static_cast<uint32_t>(spaceChars.size() - 1)
    );

    std::string str;
    str.reserve(longest);

    while (true) {
        str.clear();

        const auto sz = dist(engine);
        for (uint32_t i = 0; i < sz; ++i) {
            str += spaceChars[distChars(engine)];
        }

        yield str;
    }
}
/**
 * Builds a source string from `tokens` by following every token with a
 * random run of whitespace (1 to 10 characters of ' ', '\t' or '\n').
 *
 * The whitespace is random, so the exact string differs between calls; only
 * the token order is fixed.
 */
template <std::ranges::range R>
requires(std::is_same_v<std::ranges::range_value_t<R>, std::string_view>)
static std::string SourceFromTokens(R &&tokens) {
    // Kept as one full expression so the temporary whitespace generator
    // stays alive until the string has been completely materialised.
    return std::ranges::to<std::string>(
        std::views::join(InterleaveRanges(tokens, WhitespaceGenerator(10)))
    );
}