From e1b9e054f31409e86740d2e6118dda866f657a66 Mon Sep 17 00:00:00 2001 From: erick-alcachofa Date: Fri, 3 Oct 2025 12:54:41 -0600 Subject: [PATCH] feat(test, tokenizer): Add test suite, fix caught Tokenizer issues, and add range-based API Signed-off-by: erick-alcachofa This commit introduces a comprehensive test suite for the tokenizer using the Catch2 framework. To support this and improve the project structure, the build system and the tokenizer's API have been significantly updated. - Removed `cmake/testing.cmake` as it's no longer needed. - A new `TokenizerRange` class provides a C++20-style range interface, allowing for simple `for-each` loop iteration over tokens. This is used extensively in the new tests. - The CMake build system has been refactored: - An `ENABLE_TESTING` option (OFF by default) now controls whether the test suite is built. - The core library is now compiled into an object library, which is then used to produce both a shared (`.so`/`.dll`) and a static (`.a`/`.lib`) library. This improves build efficiency and provides more flexible linkage options. - The frontend executable now links against the static version of the library. - Implemented tests for the tokenizer using the Catch2 framework, covering various cases such as identifiers, keywords, and numbers; these tests already caught some issues in the current implementation. - Several parsing bugs and edge cases in the tokenizer were fixed, including the handling of unterminated strings and invalid numeric literals. The README has been updated with instructions for building and running tests. 
--- CMakeLists.txt | 23 ++- README.md | 4 + cmake/testing.cmake | 0 frontend/CMakeLists.txt | 2 +- lib/CMakeLists.txt | 76 +++++--- lib/include/artichoke/Tokenizer/Tokenizer.hpp | 4 +- .../artichoke/Tokenizer/TokenizerRange.hpp | 71 +++++++ lib/src/Tokenizer/Tokenizer.cpp | 64 ++++++- lib/src/Tokenizer/TokenizerRange.cpp | 84 +++++++++ tests/CMakeLists.txt | 20 ++ tests/Tokenizer/CMakeLists.txt | 16 ++ tests/Tokenizer/src/Api.cpp | 99 ++++++++++ tests/Tokenizer/src/Comments.cpp | 87 +++++++++ tests/Tokenizer/src/Identifiers.cpp | 127 +++++++++++++ tests/Tokenizer/src/Keywords.cpp | 93 +++++++++ tests/Tokenizer/src/Numbers.cpp | 171 +++++++++++++++++ tests/Tokenizer/src/Operators.cpp | 178 ++++++++++++++++++ tests/Tokenizer/src/Strings.cpp | 168 +++++++++++++++++ tests/cmake/dependencies.cmake | 18 ++ tests/include/Utils.hpp | 58 ++++++ 20 files changed, 1323 insertions(+), 40 deletions(-) delete mode 100644 cmake/testing.cmake create mode 100644 lib/include/artichoke/Tokenizer/TokenizerRange.hpp create mode 100644 lib/src/Tokenizer/TokenizerRange.cpp create mode 100644 tests/CMakeLists.txt create mode 100644 tests/Tokenizer/CMakeLists.txt create mode 100644 tests/Tokenizer/src/Api.cpp create mode 100644 tests/Tokenizer/src/Comments.cpp create mode 100644 tests/Tokenizer/src/Identifiers.cpp create mode 100644 tests/Tokenizer/src/Keywords.cpp create mode 100644 tests/Tokenizer/src/Numbers.cpp create mode 100644 tests/Tokenizer/src/Operators.cpp create mode 100644 tests/Tokenizer/src/Strings.cpp create mode 100644 tests/cmake/dependencies.cmake create mode 100644 tests/include/Utils.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 1239d3a..e47e6f1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,19 +15,14 @@ set(PROJECT_URL "lang.artichoke.dev") set(PROJECT_AUTHOR "erick-alcachofa") set(PROJECT_AUTHOR_GITHUB "@erick-alcachofa") -include(cmake/testing.cmake) +option(ENABLE_TESTING "Enable build of tests for library" OFF) add_subdirectory(lib) 
add_subdirectory(frontend) install( - TARGETS frontend library - EXPORT artichokeTargets - FILE_SET HEADERS - LIBRARY DESTINATION lib - ARCHIVE DESTINATION lib + TARGETS frontend RUNTIME DESTINATION bin - INCLUDES DESTINATION include ) get_target_property( @@ -43,6 +38,16 @@ install( )" ) +install( + TARGETS library library_static + EXPORT artichokeTargets + FILE_SET HEADERS + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib + RUNTIME DESTINATION bin + INCLUDES DESTINATION include +) + install( EXPORT artichokeTargets FILE artichokeTargets.cmake @@ -62,3 +67,7 @@ install( "${CMAKE_CURRENT_BINARY_DIR}/artichokeConfigVersion.cmake" DESTINATION lib/cmake/artichoke ) + +if(ENABLE_TESTING) + add_subdirectory(tests) +endif() diff --git a/README.md b/README.md index a4e5e44..a5d9dfd 100644 --- a/README.md +++ b/README.md @@ -46,6 +46,7 @@ grammar is stable, and the next step is the implementation of a compiler git clone https://git.artichoke.dev/me/artichoke-lang.git # Configure cmake +# Optionally add -DENABLE_TESTING=ON for building tests cmake -DCMAKE_BUILD_TYPE=Release -S . 
-B build # Build the project @@ -54,6 +55,9 @@ cmake --build build # Run the binary ./build/frontend/artichoke-c +# Run the tests if enabled +ctest --test-dir build/tests --output-on-failure + # Install if wanted cmake --install build --prefix=/usr/local diff --git a/cmake/testing.cmake b/cmake/testing.cmake deleted file mode 100644 index e69de29..0000000 diff --git a/frontend/CMakeLists.txt b/frontend/CMakeLists.txt index 2bacfdb..14091df 100644 --- a/frontend/CMakeLists.txt +++ b/frontend/CMakeLists.txt @@ -34,5 +34,5 @@ target_include_directories( target_link_libraries( frontend PUBLIC - library + artichoke::library_static ) diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index a29bf00..938a2c4 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -5,17 +5,17 @@ file(GLOB_RECURSE ARTI_LIB_HEADERS "include/**.hpp") file(GLOB_RECURSE ARTI_LIB_GEN_HEADERS "${CMAKE_CURRENT_BINARY_DIR}/include/**.hpp") add_library( - library SHARED + objs OBJECT ${ARTI_LIB_SOURCES} ) set_target_properties( - library PROPERTIES - OUTPUT_NAME "artichoke" + objs PROPERTIES + POSITION_INDEPENDENT_CODE 1 ) target_compile_options( - library PRIVATE + objs PRIVATE -pedantic -Wall -Wextra @@ -30,24 +30,58 @@ target_compile_options( -Wno-unused ) -target_sources( - library PUBLIC - FILE_SET HEADERS - BASE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/include - FILES - ${ARTI_LIB_HEADERS} -) - -target_sources( - library PUBLIC - FILE_SET HEADERS - BASE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/include - FILES - ${ARTI_LIB_GEN_HEADERS} -) - target_include_directories( - library PUBLIC + objs PUBLIC $ $ ) + +add_library( + library SHARED + $ +) + +add_library( + artichoke::library ALIAS + library +) + +set_target_properties( + library PROPERTIES + OUTPUT_NAME "artichoke" +) + +add_library( + library_static STATIC + $ +) + +add_library( + artichoke::library_static ALIAS + library_static +) + +set_target_properties( + library_static PROPERTIES + OUTPUT_NAME "artichoke" +) + +set(LIB_TARGETS objs library 
library_static) + +foreach(TGET IN LISTS LIB_TARGETS) + target_sources( + ${TGET} INTERFACE + FILE_SET HEADERS + BASE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/include + FILES + ${ARTI_LIB_HEADERS} + ) + + target_sources( + ${TGET} INTERFACE + FILE_SET HEADERS + BASE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/include + FILES + ${ARTI_LIB_GEN_HEADERS} + ) +endforeach() diff --git a/lib/include/artichoke/Tokenizer/Tokenizer.hpp b/lib/include/artichoke/Tokenizer/Tokenizer.hpp index 3ea09a5..7269d57 100644 --- a/lib/include/artichoke/Tokenizer/Tokenizer.hpp +++ b/lib/include/artichoke/Tokenizer/Tokenizer.hpp @@ -1,12 +1,12 @@ #pragma once #include -#include #include #include #include +#include namespace arti::lang { @@ -30,6 +30,8 @@ namespace arti::lang { void swap(Tokenizer &other) noexcept; + TokenizerRange range() noexcept; + private: Generator> tokenize(); diff --git a/lib/include/artichoke/Tokenizer/TokenizerRange.hpp b/lib/include/artichoke/Tokenizer/TokenizerRange.hpp new file mode 100644 index 0000000..63891d5 --- /dev/null +++ b/lib/include/artichoke/Tokenizer/TokenizerRange.hpp @@ -0,0 +1,71 @@ +#pragma once + +#include +#include + +namespace arti::lang { + struct Tokenizer; + + struct [[nodiscard]] TokenizerRange { + friend struct Tokenizer; + struct Iterator; + struct Sentinel; + + using iterator_type = Iterator; + using sentinel_type = Sentinel; + + TokenizerRange(TokenizerRange &&) noexcept; + TokenizerRange &operator=(TokenizerRange &&) noexcept; + + TokenizerRange(const TokenizerRange &) noexcept = delete; + TokenizerRange &operator=(const TokenizerRange &) noexcept = delete; + + Iterator begin(); + Sentinel end() const noexcept; + + struct Iterator { + friend struct TokenizerRange; + using iterator_category = std::input_iterator_tag; + using difference_type = std::ptrdiff_t; + + using ValueType = Expected; + using ReferenceType = ValueType &; + using PointerType = ValueType *; + + using value_type = ValueType; + using pointer_type = PointerType; + + Iterator(Iterator 
&&) noexcept; + Iterator &operator=(Iterator &&) noexcept; + + Iterator(const Iterator &) noexcept = delete; + Iterator &operator=(const Iterator &) noexcept = delete; + + Iterator &operator++(); + + void operator++(int); + + ReferenceType operator*() const noexcept; + PointerType operator->() const noexcept; + + friend bool operator==(const Iterator &, Sentinel); + friend bool operator==(Sentinel, const Iterator &); + + friend bool operator!=(const Iterator &, Sentinel); + friend bool operator!=(Sentinel, const Iterator &); + + private: + Iterator(Tokenizer *tokenizer) noexcept; + + Tokenizer *tokenizer; + mutable Expected cvalue; + }; + + struct Sentinel {}; + + private: + TokenizerRange(Tokenizer *tokenizer); + + Tokenizer *tokenizer; + }; +} diff --git a/lib/src/Tokenizer/Tokenizer.cpp b/lib/src/Tokenizer/Tokenizer.cpp index 09f0fec..c74574c 100644 --- a/lib/src/Tokenizer/Tokenizer.cpp +++ b/lib/src/Tokenizer/Tokenizer.cpp @@ -1,8 +1,8 @@ #include -#include #include +#include #include #include #include @@ -36,8 +36,22 @@ namespace arti::lang { return *this; } + TokenizerRange Tokenizer::range() noexcept { + return TokenizerRange{ this }; + } + Expected Tokenizer::consume(std::size_t n) noexcept { - while (not tokensBuffer.empty()) { + while (n > 0) { + if (tokensBuffer.empty()) { + if (auto ok = peek(); ! 
ok) { + return Unexpected<>{ ok.error() }; + } + + if (finished()) { + break; + } + } + tokensBuffer.pop_front(); n -= 1; } @@ -94,7 +108,11 @@ namespace arti::lang { bool Tokenizer::finished() const noexcept { if (tokensGenerator.finished()) { - return tokensBuffer.empty(); + if (!tokensBuffer.empty()) { + return tokensBuffer.front().value == TokenV::tkEOF; + } + + return true; } return false; @@ -340,6 +358,7 @@ namespace arti::lang { } if (*iter == '.') { + auto dotIter = iter; forward(); while (iter != source.end()) { @@ -350,6 +369,21 @@ namespace arti::lang { forward(); } + if ((iter - dotIter) == 1) { + /* Revert to dot */ + --iter; + --column; + + return langException( + line, + column, + "digit", + iter == source.end() + ? "EOF" + : std::string{ *(iter + 1) } + ); + } + return Token{ TokenV::tkDecimal, cLine, @@ -390,7 +424,6 @@ namespace arti::lang { } if (*iter == '"') { - forward(); break; } @@ -412,12 +445,23 @@ namespace arti::lang { forward(); } - return Token{ - TokenV::tkString, - cLine, - cColumn, - { stIter, iter } - }; + if (*iter == '"') { + forward(); + + return Token{ + TokenV::tkString, + cLine, + cColumn, + { stIter, iter } + }; + } + + return langException( + line, + column, + "end of string (\")", + "EOF" + ); } Expected Tokenizer::readCharacter() { diff --git a/lib/src/Tokenizer/TokenizerRange.cpp b/lib/src/Tokenizer/TokenizerRange.cpp new file mode 100644 index 0000000..299c816 --- /dev/null +++ b/lib/src/Tokenizer/TokenizerRange.cpp @@ -0,0 +1,84 @@ +#include + +#include + +#include + +namespace arti::lang { + + using Iterator = TokenizerRange::Iterator; + using Sentinel = TokenizerRange::Sentinel; + + TokenizerRange::TokenizerRange(Tokenizer *tokenizer) + : tokenizer(tokenizer) { } + + TokenizerRange::TokenizerRange(TokenizerRange &&other) noexcept + : tokenizer(std::exchange(other.tokenizer, nullptr)) { } + + TokenizerRange &TokenizerRange::operator=(TokenizerRange &&other) noexcept { + this->tokenizer = 
std::exchange(other.tokenizer, nullptr); + return *this; + } + + Iterator TokenizerRange::begin() { + return Iterator{ this->tokenizer }; + } + + Sentinel TokenizerRange::end() const noexcept { + return Sentinel{}; + } + + Iterator::Iterator(Tokenizer *tokenizer) noexcept + : tokenizer(tokenizer) + , cvalue(tokenizer->peek()) { } + + Iterator::Iterator(Iterator &&other) noexcept + : tokenizer(std::exchange(other.tokenizer, nullptr)) + , cvalue(std::exchange(other.cvalue, {})) { } + + Iterator &Iterator::operator=(Iterator &&other) noexcept { + this->tokenizer = std::exchange(other.tokenizer, nullptr); + this->cvalue = std::exchange(other.cvalue, {}); + return *this; + } + + Iterator &Iterator::operator++() { + if (this->cvalue) { + std::ignore = tokenizer->consume(); + } + this->cvalue = tokenizer->peek(); + return *this; + } + + void Iterator::operator++(int) { + std::ignore = this->operator++(); + } + + Iterator::ReferenceType Iterator::operator*() const noexcept { + return this->cvalue; + } + + Iterator::PointerType Iterator::operator->() const noexcept { + return &this->cvalue; + } + + bool operator==(const Iterator &it, Sentinel) { + if (it.tokenizer->finished()) { + return true; + } + return !it.cvalue.has_value() || it.cvalue->value == TokenV::tkEOF; + } + + bool operator==(Sentinel, const Iterator &it) { + return it == Sentinel{}; + } + + bool operator!=(const Iterator &it, Sentinel) { + return !(it == Sentinel{}); + } + + bool operator!=(Sentinel, const Iterator &it) { + return !(it == Sentinel{}); + } + +} // namespace arti::lang diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt new file mode 100644 index 0000000..14b6ab3 --- /dev/null +++ b/tests/CMakeLists.txt @@ -0,0 +1,20 @@ +include(cmake/dependencies.cmake) + +enable_testing() + +add_library(test INTERFACE) + +target_include_directories( + test INTERFACE + ${CMAKE_CURRENT_SOURCE_DIR}/include +) + +target_link_libraries( + test INTERFACE + artichoke::library + Catch2::Catch2WithMain +) + 
+include(Catch) + +add_subdirectory(Tokenizer) diff --git a/tests/Tokenizer/CMakeLists.txt b/tests/Tokenizer/CMakeLists.txt new file mode 100644 index 0000000..7b30d1f --- /dev/null +++ b/tests/Tokenizer/CMakeLists.txt @@ -0,0 +1,16 @@ +file(GLOB_RECURSE TOKENIZER_TEST_SRC "src/**.cpp") + +add_executable( + test-tokenizer + ${TOKENIZER_TEST_SRC} +) + +target_link_libraries( + test-tokenizer PRIVATE + test +) + +catch_discover_tests( + test-tokenizer + TEST_PREFIX "Tokenizer." +) diff --git a/tests/Tokenizer/src/Api.cpp b/tests/Tokenizer/src/Api.cpp new file mode 100644 index 0000000..68b6f50 --- /dev/null +++ b/tests/Tokenizer/src/Api.cpp @@ -0,0 +1,99 @@ +#include + +#include +#include + +#include + +namespace lang = arti::lang; + +TEST_CASE("API_PeekOffset", "[api][peek]") { + const std::string source = "a b c"; + lang::Tokenizer tkz{ source }; + + auto t0 = tkz.peek(0); + REQUIRE(t0.has_value()); + REQUIRE(t0->value == lang::TokenV::tkIdentifier); + REQUIRE(t0->strValue == "a"); + + auto t1 = tkz.peek(1); + REQUIRE(t1.has_value()); + REQUIRE(t1->value == lang::TokenV::tkIdentifier); + REQUIRE(t1->strValue == "b"); + + auto t2 = tkz.peek(2); + REQUIRE(t2.has_value()); + REQUIRE(t2->value == lang::TokenV::tkIdentifier); + REQUIRE(t2->strValue == "c"); + + auto t3 = tkz.peek(3); + REQUIRE(t3.has_value()); + REQUIRE(t3->value == lang::TokenV::tkEOF); + + // Ensure nothing was consumed by peeks + REQUIRE_FALSE(tkz.finished()); + auto t0_again = tkz.peek(); + REQUIRE(t0_again.has_value()); + REQUIRE(t0_again->value == lang::TokenV::tkIdentifier); + REQUIRE(t0_again->strValue == "a"); +} + +TEST_CASE("API_ConsumeAndFinishedSemantics", "[api][consume][finished]") { + const std::string source = "x y z"; + lang::Tokenizer tkz{ source }; + + // consume 2 tokens: x, y + REQUIRE(tkz.consume(2).has_value()); + REQUIRE_FALSE(tkz.finished()); + + auto t = tkz.peek(); + REQUIRE(t.has_value()); + REQUIRE(t->value == lang::TokenV::tkIdentifier); + REQUIRE(t->strValue == "z"); + + 
// consume last token + REQUIRE(tkz.consume().has_value()); + // finished() returns true only after EOF token has been produced + REQUIRE_FALSE(tkz.finished()); + + auto eof = tkz.peek(); + REQUIRE(eof.has_value()); + REQUIRE(eof->value == lang::TokenV::tkEOF); + REQUIRE(tkz.finished()); +} + +TEST_CASE("API_RangeIterationStopsAtEOF", "[api][range]") { + const std::string source = + "let ident := 42 /* skip this */ + 1"; + lang::Tokenizer tkz{ source }; + + std::vector kinds; + std::vector lex; + + for (auto token : tkz.range()) { + REQUIRE(token.has_value()); + kinds.push_back(token->value); + lex.push_back(token->strValue); + } + + // Expected sequence: kwLet, tkIdentifier("ident"), opLabel, tkInteger("42"), opPlus, tkInteger("1") + REQUIRE(kinds.size() == 6); + REQUIRE(kinds[0] == lang::TokenV::kwLet); + REQUIRE(kinds[1] == lang::TokenV::tkIdentifier); + REQUIRE(kinds[2] == lang::TokenV::opLabel); + REQUIRE(kinds[3] == lang::TokenV::tkInteger); + REQUIRE(kinds[4] == lang::TokenV::opPlus); + REQUIRE(kinds[5] == lang::TokenV::tkInteger); + + REQUIRE(lex[0] == "let"); + REQUIRE(lex[1] == "ident"); + REQUIRE(lex[2] == ":="); + REQUIRE(lex[3] == "42"); + REQUIRE(lex[4] == "+"); + REQUIRE(lex[5] == "1"); + + // After iterating the range, peek should yield EOF + auto eof = tkz.peek(); + REQUIRE(eof.has_value()); + REQUIRE(eof->value == lang::TokenV::tkEOF); +} diff --git a/tests/Tokenizer/src/Comments.cpp b/tests/Tokenizer/src/Comments.cpp new file mode 100644 index 0000000..30cbdc6 --- /dev/null +++ b/tests/Tokenizer/src/Comments.cpp @@ -0,0 +1,87 @@ +#include + +#include + +#include +#include + +namespace lang = arti::lang; + +TEST_CASE("Comments_BlockSkipped", "[comments][block][skip]") { + // Ensure that block comments are ignored and do not emit tokens. 
+ const std::string source = "foo /* a block comment with symbols 123 !@# */ bar"; + + lang::Tokenizer tkz{source}; + + auto t1 = tkz.peek(); + REQUIRE(t1.has_value()); + REQUIRE(t1->value == lang::TokenV::tkIdentifier); + REQUIRE(t1->strValue == "foo"); + + REQUIRE(tkz.consume().has_value()); + + auto t2 = tkz.peek(); + REQUIRE(t2.has_value()); + REQUIRE(t2->value == lang::TokenV::tkIdentifier); + REQUIRE(t2->strValue == "bar"); + + REQUIRE(tkz.consume().has_value()); + + auto eof = tkz.peek(); + REQUIRE(eof.has_value()); + REQUIRE(eof->value == lang::TokenV::tkEOF); +} + +TEST_CASE("Comments_BlockMultiline", "[comments][block][multiline]") { + const std::string source = + "alpha /* line1\n" + "line2\n" + "line3 */ beta"; + + lang::Tokenizer tkz{source}; + + auto t1 = tkz.peek(); + REQUIRE(t1.has_value()); + REQUIRE(t1->value == lang::TokenV::tkIdentifier); + REQUIRE(t1->strValue == "alpha"); + REQUIRE(tkz.consume().has_value()); + + auto t2 = tkz.peek(); + REQUIRE(t2.has_value()); + REQUIRE(t2->value == lang::TokenV::tkIdentifier); + REQUIRE(t2->strValue == "beta"); + REQUIRE(tkz.consume().has_value()); + + auto eof = tkz.peek(); + REQUIRE(eof.has_value()); + REQUIRE(eof->value == lang::TokenV::tkEOF); +} + +TEST_CASE("Comments_UnterminatedBlock_Error", "[comments][block][error]") { + // Unterminated block comments should surface an error from the tokenizer. + const std::string source = "foo /* this never ends..."; + + lang::Tokenizer tkz{source}; + + auto t1 = tkz.peek(); + REQUIRE(t1.has_value()); + REQUIRE(t1->value == lang::TokenV::tkIdentifier); + REQUIRE(t1->strValue == "foo"); + REQUIRE(tkz.consume().has_value()); + + auto errTok = tkz.peek(); + REQUIRE_FALSE(errTok.has_value()); + + // Check error message mentions invalid comment. 
+ const auto &err = errTok.error(); + REQUIRE(err.message.find("Invalid comment") != std::string::npos); +} + +TEST_CASE("Comments_SingleLineUnsupported_Skip", "[comments][.line]") { + // The tokenizer currently does NOT support '//' comments. + // Keep this test as a placeholder and mark it skipped to avoid failures. + SKIP("Single-line '//' comments are not supported yet by the tokenizer"); + const std::string source = "foo // comment\n bar"; + lang::Tokenizer tkz{source}; + (void)tkz; // silence unused +} diff --git a/tests/Tokenizer/src/Identifiers.cpp b/tests/Tokenizer/src/Identifiers.cpp new file mode 100644 index 0000000..b34317d --- /dev/null +++ b/tests/Tokenizer/src/Identifiers.cpp @@ -0,0 +1,127 @@ +#include + +#include +#include +#include + +#include +#include + +namespace lang = arti::lang; + +template +static void CommonIdentifiersSuccess( + const std::array &ids +) { + const std::string source = SourceFromTokens(ids); + + std::size_t it = 0; + lang::Tokenizer tkz{ source }; + + for (auto token : tkz.range()) { + REQUIRE(token.has_value()); + REQUIRE(token->value == lang::TokenV::tkIdentifier); + REQUIRE(token->strValue == ids.at(it++)); + } + + REQUIRE(it == ids.size()); + REQUIRE(tkz.peek().has_value()); + REQUIRE(tkz.peek()->value == lang::TokenV::tkEOF); +} + +TEST_CASE("Identifiers_Basic", "[identifiers][valid]") { + constexpr std::array ids = { + "a", "abc", "a_b", "snake_case", "camelCase", "PascalCase", "_id", "with123" + }; + + CommonIdentifiersSuccess(ids); +} + +TEST_CASE("Identifiers_DigitsAfterFirst", "[identifiers][valid]") { + constexpr std::array ids = { + "a1", "abc123", "_a1_b2", "v2", "x9y8z7", "i18n" + }; + + CommonIdentifiersSuccess(ids); +} + +TEST_CASE("Identifiers_Long", "[identifiers][valid][long]") { + // Create a long identifier to ensure tokenizer handles large spans. 
+ std::string longId(512, 'a'); + std::vector toks{ longId }; + + const std::string source = SourceFromTokens(toks); + lang::Tokenizer tkz{ source }; + + auto t = tkz.peek(); + REQUIRE(t.has_value()); + REQUIRE(t->value == lang::TokenV::tkIdentifier); + REQUIRE(t->strValue == longId); + + REQUIRE(tkz.consume().has_value()); + auto eof = tkz.peek(); + REQUIRE(eof.has_value()); + REQUIRE(eof->value == lang::TokenV::tkEOF); +} + +TEST_CASE("Identifiers_WithOperators", "[identifiers][operators]") { + // '$' and '?' are operators, not identifier characters. + // '$foo' -> opMut, tkIdentifier("foo") + // '?bar' -> opOpt, tkIdentifier("bar") + const std::string source = "$foo ?bar"; + + lang::Tokenizer tkz{ source }; + + auto t1 = tkz.peek(); + REQUIRE(t1.has_value()); + REQUIRE(t1->value == lang::TokenV::opMut); + REQUIRE(tkz.consume().has_value()); + + auto t2 = tkz.peek(); + REQUIRE(t2.has_value()); + REQUIRE(t2->value == lang::TokenV::tkIdentifier); + REQUIRE(t2->strValue == "foo"); + REQUIRE(tkz.consume().has_value()); + + auto t3 = tkz.peek(); + REQUIRE(t3.has_value()); + REQUIRE(t3->value == lang::TokenV::opOpt); + REQUIRE(tkz.consume().has_value()); + + auto t4 = tkz.peek(); + REQUIRE(t4.has_value()); + REQUIRE(t4->value == lang::TokenV::tkIdentifier); + REQUIRE(t4->strValue == "bar"); + REQUIRE(tkz.consume().has_value()); + + auto eof = tkz.peek(); + REQUIRE(eof.has_value()); + REQUIRE(eof->value == lang::TokenV::tkEOF); +} + +TEST_CASE("Identifiers_DotAccess", "[identifiers][dot]") { + const std::string source = "foo.bar"; + + lang::Tokenizer tkz{ source }; + + auto t1 = tkz.peek(); + REQUIRE(t1.has_value()); + REQUIRE(t1->value == lang::TokenV::tkIdentifier); + REQUIRE(t1->strValue == "foo"); + REQUIRE(tkz.consume().has_value()); + + auto t2 = tkz.peek(); + REQUIRE(t2.has_value()); + REQUIRE(t2->value == lang::TokenV::opDot); + REQUIRE(tkz.consume().has_value()); + + auto t3 = tkz.peek(); + REQUIRE(t3.has_value()); + REQUIRE(t3->value == 
lang::TokenV::tkIdentifier); + REQUIRE(t3->strValue == "bar"); + REQUIRE(tkz.consume().has_value()); + + auto eof = tkz.peek(); + REQUIRE(eof.has_value()); + REQUIRE(eof->value == lang::TokenV::tkEOF); +} diff --git a/tests/Tokenizer/src/Keywords.cpp b/tests/Tokenizer/src/Keywords.cpp new file mode 100644 index 0000000..fd85cec --- /dev/null +++ b/tests/Tokenizer/src/Keywords.cpp @@ -0,0 +1,93 @@ +#include + +#include + +#include + +#include + +namespace lang = arti::lang; + +template +static void CommonKeywordsSuccess( + const std::array &lexemes, + const std::array &kinds +) { + static_assert(N > 0, "Must provide at least one keyword"); + const std::string source = SourceFromTokens(lexemes); + + std::size_t it = 0; + lang::Tokenizer tkz{ source }; + + for (auto token : tkz.range()) { + REQUIRE(token.has_value()); + REQUIRE(token->value == kinds.at(it)); + REQUIRE(token->strValue == lexemes.at(it)); + ++it; + } + + REQUIRE(it == lexemes.size()); + REQUIRE(tkz.peek().has_value()); + REQUIRE(tkz.peek()->value == lang::TokenV::tkEOF); +} + +TEST_CASE("Keywords_AllRecognized", "[keywords][valid]") { + constexpr std::array lexemes = { + "_","or","not","and","if","else","fn","enum","struct","def","let","for", + "loop","break","continue","while","match","switch","return","unreachable", + "defer","errdefer","true","false","null","this","import","export","module","using","this" + }; + + constexpr std::array kinds = { + lang::TokenV::kwUnderscore, + lang::TokenV::kwOr, + lang::TokenV::kwNot, + lang::TokenV::kwAnd, + lang::TokenV::kwIf, + lang::TokenV::kwElse, + lang::TokenV::kwFn, + lang::TokenV::kwEnum, + lang::TokenV::kwStruct, + lang::TokenV::kwDef, + lang::TokenV::kwLet, + lang::TokenV::kwFor, + lang::TokenV::kwLoop, + lang::TokenV::kwBreak, + lang::TokenV::kwContinue, + lang::TokenV::kwWhile, + lang::TokenV::kwMatch, + lang::TokenV::kwSwitch, + lang::TokenV::kwReturn, + lang::TokenV::kwUnreachable, + lang::TokenV::kwDefer, + lang::TokenV::kwErrDefer, + 
lang::TokenV::kwTrue, + lang::TokenV::kwFalse, + lang::TokenV::kwNull, + lang::TokenV::kwThis, + lang::TokenV::kwImport, + lang::TokenV::kwExport, + lang::TokenV::kwModule, + lang::TokenV::kwUsing, + lang::TokenV::kwThis, + }; + + CommonKeywordsSuccess(lexemes, kinds); +} + +TEST_CASE("Keywords_PrecedenceOverIdentifiers", "[keywords][precedence]") { + // Ensure that keywords are recognized as keywords, while longer names remain identifiers. + constexpr std::array lexemes = { + "if", "iff", "return", "returnX", "_", "_id" + }; + constexpr std::array kinds = { + lang::TokenV::kwIf, // "if" is a keyword + lang::TokenV::tkIdentifier,// "iff" should be an identifier + lang::TokenV::kwReturn, // "return" is a keyword + lang::TokenV::tkIdentifier,// "returnX" is not a keyword + lang::TokenV::kwUnderscore,// "_" is a keyword in this language + lang::TokenV::tkIdentifier // "_id" is a regular identifier + }; + + CommonKeywordsSuccess(lexemes, kinds); +} diff --git a/tests/Tokenizer/src/Numbers.cpp b/tests/Tokenizer/src/Numbers.cpp new file mode 100644 index 0000000..1fac2f5 --- /dev/null +++ b/tests/Tokenizer/src/Numbers.cpp @@ -0,0 +1,171 @@ +#include + +#include + +#include + +#include + +namespace lang = arti::lang; + +template +static void CommonIntegersSuccess( + lang::TokenV type, + const std::array &expected +) { + const std::string source = SourceFromTokens(expected); + + std::size_t it = 0; + lang::Tokenizer tkz{ source }; + + for (auto token : tkz.range()) { + REQUIRE(token.has_value()); + REQUIRE(token->value == type); + REQUIRE(token->strValue == expected.at(it++)); + } + + REQUIRE(it == expected.size()); + REQUIRE(tkz.peek().has_value()); + REQUIRE(tkz.peek()->value == lang::TokenV::tkEOF); +} + +TEST_CASE("Numbers_Integers", "[integers][valid]") { + constexpr std::array expected = { + "0", "1", "42", "123456", "98712390", "12381723912465471" + }; + + CommonIntegersSuccess(lang::TokenV::tkInteger, expected); +} + +TEST_CASE("Numbers_HexIntegers", 
"[integers][valid]") { + constexpr std::array expected = { + "0x831", "0xAFEFE", "0xABEBE", + "0x7a147e8a3", "0x98712390", "0x1d238c18e7ff239a12465471" + }; + + CommonIntegersSuccess(lang::TokenV::tkInteger, expected); +} + +TEST_CASE("Numbers_OctIntegers", "[integers][valid]") { + constexpr std::array expected = { + "041", "064123", "0136237", "012345", "01", "071236571236512631723651" + }; + + CommonIntegersSuccess(lang::TokenV::tkInteger, expected); +} + +TEST_CASE("Numbers_BinIntegers", "[integers][valid]") { + constexpr std::array expected = { + "0b0101101", "0b1", "0b01", "0b0", "0b011010101110101101110101011", + "0b11110101011010101" + }; + + CommonIntegersSuccess(lang::TokenV::tkInteger, expected); +} + +TEST_CASE("Numbers_Decimal", "[decimals][valid]") { + constexpr std::array expected = { + "1.0", "0.5", "3.14159", "10.50", "9999.0001", "1375123476175981.813751235" + }; + + CommonIntegersSuccess(lang::TokenV::tkDecimal, expected); +} + +TEST_CASE("Numbers_UnaryMinusSeparate", "[numbers][unary-minus]") { + const std::string source = "-1 -2.5"; + lang::Tokenizer tkz{ source }; + + auto t1 = tkz.peek(); + REQUIRE(t1.has_value()); + REQUIRE(t1->value == lang::TokenV::opHyphen); + REQUIRE(tkz.consume().has_value()); + + auto t2 = tkz.peek(); + REQUIRE(t2.has_value()); + REQUIRE(t2->value == lang::TokenV::tkInteger); + REQUIRE(t2->strValue == "1"); + REQUIRE(tkz.consume().has_value()); + + auto t3 = tkz.peek(); + REQUIRE(t3.has_value()); + REQUIRE(t3->value == lang::TokenV::opHyphen); + REQUIRE(tkz.consume().has_value()); + + auto t4 = tkz.peek(); + REQUIRE(t4.has_value()); + REQUIRE(t4->value == lang::TokenV::tkDecimal); + REQUIRE(t4->strValue == "2.5"); + REQUIRE(tkz.consume().has_value()); + + auto eof = tkz.peek(); + REQUIRE(eof.has_value()); + REQUIRE(eof->value == lang::TokenV::tkEOF); +} + +TEST_CASE("Numbers_DotBoundaries_Disambiguation", "[numbers][dot][edge]") { + // Expect numbers must start with a digit: + // .5 -> '.' + '5' + // 10. 
-> ERROR (expects digit after '.'), then '.' token if continued + // 1..2 -> ERROR (expects digit after '.'), then '.' '.' '2' if continued + const std::string source = ".5 10. 1..2"; + lang::Tokenizer tkz{ source }; + + // .5 -> '.' then '5' + auto a1 = tkz.peek(); + REQUIRE(a1.has_value()); + REQUIRE(a1->value == lang::TokenV::opDot); + REQUIRE(tkz.consume().has_value()); + + auto a2 = tkz.peek(); + REQUIRE(a2.has_value()); + REQUIRE(a2->value == lang::TokenV::tkInteger); + REQUIRE(a2->strValue == "5"); + REQUIRE(tkz.consume().has_value()); + + // 10. -> first an error (expects a digit after '.'), then '.' is parsed if we continue + auto err1 = tkz.peek(); + REQUIRE_FALSE(err1.has_value()); + + auto after_err1 = tkz.peek(); + REQUIRE(after_err1.has_value()); + REQUIRE(after_err1->value == lang::TokenV::opDot); + REQUIRE(tkz.consume().has_value()); + + // 1..2 -> first an error (expects a digit after '.'), then '.' '.' '2' + auto err2 = tkz.peek(); + REQUIRE_FALSE(err2.has_value()); + + auto dot1 = tkz.peek(); + REQUIRE(dot1.has_value()); + REQUIRE(dot1->value == lang::TokenV::opDot); + REQUIRE(tkz.consume().has_value()); + + auto dot2 = tkz.peek(); + REQUIRE(dot2.has_value()); + REQUIRE(dot2->value == lang::TokenV::opDot); + REQUIRE(tkz.consume().has_value()); + + auto last = tkz.peek(); + REQUIRE(last.has_value()); + REQUIRE(last->value == lang::TokenV::tkInteger); + REQUIRE(last->strValue == "2"); + REQUIRE(tkz.consume().has_value()); + + auto eof = tkz.peek(); + REQUIRE(eof.has_value()); + REQUIRE(eof->value == lang::TokenV::tkEOF); +} + +TEST_CASE("Numbers_InvalidPrefixes", "[numbers][invalid]") { + const std::array invalids = { "0x", "0b", "0xG", "0b2", "08" }; + + for (auto src : invalids) { + lang::Tokenizer tkz{ std::string{src} }; + auto tok = tkz.peek(); + REQUIRE_FALSE(tok.has_value()); + const auto &err = tok.error(); + REQUIRE( + err.message.find("Invalid literal") != std::string::npos + ); + } +} diff --git a/tests/Tokenizer/src/Operators.cpp 
b/tests/Tokenizer/src/Operators.cpp new file mode 100644 index 0000000..9955bc0 --- /dev/null +++ b/tests/Tokenizer/src/Operators.cpp @@ -0,0 +1,178 @@ +#include + +#include +#include +#include + +#include +#include + +namespace lang = arti::lang; + +template +static void CommonOpsSuccess( + const std::array &lexemes, + const std::array &kinds +) { + static_assert(N > 0, "Must provide at least one operator"); + const std::string source = SourceFromTokens(lexemes); + + std::size_t it = 0; + lang::Tokenizer tkz{ source }; + + for (auto token : tkz.range()) { + REQUIRE(token.has_value()); + REQUIRE(token->value == kinds.at(it)); + REQUIRE(token->strValue == lexemes.at(it)); + ++it; + } + + REQUIRE(it == lexemes.size()); + REQUIRE(tkz.peek().has_value()); + REQUIRE(tkz.peek()->value == lang::TokenV::tkEOF); +} + +TEST_CASE("Operators_SingleChar", "[operators][single]") { + constexpr std::array lex = { + ".", "%", "+", "-", "/", "!", "*", ":", ",", "=", ";", "^", "~", + "&", "|", "<", ">", "(", ")", "[", "]", "{", "}", "?", "$" + }; + constexpr std::array kinds = { + lang::TokenV::opDot, lang::TokenV::opMod, lang::TokenV::opPlus, lang::TokenV::opHyphen, + lang::TokenV::opSlash, lang::TokenV::opBang, lang::TokenV::opStar, lang::TokenV::opColon, + lang::TokenV::opComma, lang::TokenV::opAssign, lang::TokenV::opSemicolon, lang::TokenV::opCaret, + lang::TokenV::opTilde, lang::TokenV::opAnd, lang::TokenV::opOr, lang::TokenV::opLt, + lang::TokenV::opGt, lang::TokenV::opLParen, lang::TokenV::opRParen, lang::TokenV::opLBracket, + lang::TokenV::opRBracket, lang::TokenV::opLSquirly, lang::TokenV::opRSquirly, lang::TokenV::opOpt, + lang::TokenV::opMut + }; + + CommonOpsSuccess(lex, kinds); +} + +TEST_CASE("Operators_MultiChar", "[operators][multi]") { + constexpr std::array lex = { + "==","!=", "<=", ">=", "<<", ">>", "&&", "||", + "+=", "-=", "*=", "/=", "%=", "&=", "|=", + "<<=", ">>=", "&&=", "||=", "->", "::", ":=" + }; + constexpr std::array kinds = { + lang::TokenV::opEq, 
lang::TokenV::opNeq, lang::TokenV::opLtEq, lang::TokenV::opGtEq, + lang::TokenV::opLShift, lang::TokenV::opRShift, lang::TokenV::opBoolAnd, lang::TokenV::opBoolOr, + lang::TokenV::opPlusAssign, lang::TokenV::opHyphenAssign, lang::TokenV::opStarAssign, lang::TokenV::opSlashAssign, + lang::TokenV::opModAssign, lang::TokenV::opAndAssign, lang::TokenV::opOrAssign, + lang::TokenV::opLShiftAssign, lang::TokenV::opRShiftAssign, lang::TokenV::opBoolAndAssign, lang::TokenV::opBoolORAssign, + lang::TokenV::opArrow, lang::TokenV::opAccess, lang::TokenV::opLabel + }; + + CommonOpsSuccess(lex, kinds); +} + +TEST_CASE("Operators_DotPrefixedSpecials", "[operators][dot][special]") { + constexpr std::array lex = { + ".#", ".[", ".*", ".@" + }; + constexpr std::array kinds = { + lang::TokenV::opSliceSize, lang::TokenV::opPtrSlice, lang::TokenV::opSlicePtr, lang::TokenV::opReflect + }; + + CommonOpsSuccess(lex, kinds); +} + +TEST_CASE("Operators_GreedyLongestMatch", "[operators][greedy]") { + // Ensure the longest valid operator is selected (e.g. '>>=' is one token, not '>>' followed by '='). 
+ constexpr std::array lex = { + ">>=", "<<=", "&&=", "||=", + ">=", "<=", "->", "::" + }; + constexpr std::array kinds = { + lang::TokenV::opRShiftAssign, lang::TokenV::opLShiftAssign, lang::TokenV::opBoolAndAssign, lang::TokenV::opBoolORAssign, + lang::TokenV::opGtEq, lang::TokenV::opLtEq, lang::TokenV::opArrow, lang::TokenV::opAccess + }; + + CommonOpsSuccess(lex, kinds); +} + +TEST_CASE("Operators_BoundariesWhitespace", "[operators][boundaries]") { + // '=' and '=' separated by whitespace (a newline here) must stay two opAssign tokens, not '==' + const std::string source = "=\n="; + lang::Tokenizer tkz{ source }; + + auto t1 = tkz.peek(); + REQUIRE(t1.has_value()); + REQUIRE(t1->value == lang::TokenV::opAssign); + REQUIRE(tkz.consume().has_value()); + + auto t2 = tkz.peek(); + REQUIRE(t2.has_value()); + REQUIRE(t2->value == lang::TokenV::opAssign); + REQUIRE(tkz.consume().has_value()); + + auto eof = tkz.peek(); + REQUIRE(eof.has_value()); + REQUIRE(eof->value == lang::TokenV::tkEOF); +} + +TEST_CASE("Operators_ContextAccessLabelArrow", "[operators][context]") { + const std::string source = "ns::name := src->field"; + lang::Tokenizer tkz{ source }; + + auto t1 = tkz.peek(); + REQUIRE(t1.has_value()); + REQUIRE(t1->value == lang::TokenV::tkIdentifier); + REQUIRE(t1->strValue == "ns"); + REQUIRE(tkz.consume().has_value()); + + auto t2 = tkz.peek(); + REQUIRE(t2.has_value()); + REQUIRE(t2->value == lang::TokenV::opAccess); + REQUIRE(t2->strValue == "::"); + REQUIRE(tkz.consume().has_value()); + + auto t3 = tkz.peek(); + REQUIRE(t3.has_value()); + REQUIRE(t3->value == lang::TokenV::tkIdentifier); + REQUIRE(t3->strValue == "name"); + REQUIRE(tkz.consume().has_value()); + + auto t4 = tkz.peek(); + REQUIRE(t4.has_value()); + REQUIRE(t4->value == lang::TokenV::opLabel); + REQUIRE(t4->strValue == ":="); + REQUIRE(tkz.consume().has_value()); + + auto t5 = tkz.peek(); + REQUIRE(t5.has_value()); + REQUIRE(t5->value == lang::TokenV::tkIdentifier); + REQUIRE(t5->strValue == "src"); + REQUIRE(tkz.consume().has_value()); + + auto t6 = 
tkz.peek(); + REQUIRE(t6.has_value()); + REQUIRE(t6->value == lang::TokenV::opArrow); + REQUIRE(t6->strValue == "->"); + REQUIRE(tkz.consume().has_value()); + + auto t7 = tkz.peek(); + REQUIRE(t7.has_value()); + REQUIRE(t7->value == lang::TokenV::tkIdentifier); + REQUIRE(t7->strValue == "field"); + REQUIRE(tkz.consume().has_value()); + + auto eof = tkz.peek(); + REQUIRE(eof.has_value()); + REQUIRE(eof->value == lang::TokenV::tkEOF); +} + +TEST_CASE("Operators_InvalidStandalone_Error", "[operators][error]") { + // '#' and '@' alone are not valid tokens (only .# and .@ are valid); a lone backtick is expected to be rejected as well. + const std::vector invalids = { "#", "@", "`" }; + + for (const auto &src : invalids) { + lang::Tokenizer tkz{ src }; + auto tok = tkz.peek(); + REQUIRE_FALSE(tok.has_value()); + const auto &err = tok.error(); + REQUIRE(err.message.find("Invalid") != std::string::npos); + } +} diff --git a/tests/Tokenizer/src/Strings.cpp b/tests/Tokenizer/src/Strings.cpp new file mode 100644 index 0000000..a16a57c --- /dev/null +++ b/tests/Tokenizer/src/Strings.cpp @@ -0,0 +1,168 @@ +#include + +#include +#include + +#include +#include +#include + +namespace lang = arti::lang; + +template +static void CommonLiteralsSuccess( + lang::TokenV kind, + const std::array &lexemes +) { + const std::string source = SourceFromTokens(lexemes); + + std::size_t it = 0; + lang::Tokenizer tkz{ source }; + + for (auto token : tkz.range()) { // range() is expected to end before tkEOF; that is verified with peek() after the loop + REQUIRE(token.has_value()); + REQUIRE(token->value == kind); + REQUIRE(token->strValue == lexemes.at(it++)); + } + + REQUIRE(it == lexemes.size()); + REQUIRE(tkz.peek().has_value()); + REQUIRE(tkz.peek()->value == lang::TokenV::tkEOF); +} + +TEST_CASE("Strings_Simple", "[strings][valid]") { + constexpr std::array lexemes = { + R"("a")", + R"("hello")", + R"("with spaces")", + R"("12345")", + R"Q("!@#$%^&*()")Q" + }; + + CommonLiteralsSuccess(lang::TokenV::tkString, lexemes); +} + +TEST_CASE("Strings_Escapes", "[strings][valid][escapes]") { + // Validate common escape sequences remain 
part of lexeme text. + constexpr std::array lexemes = { + R"("quote: \"")", + R"("backslash: \\")", + R"("newline: \n")", + R"("tab: \t")", + R"("mix: \"\\\n\t")" + }; + + CommonLiteralsSuccess(lang::TokenV::tkString, lexemes); +} + +TEST_CASE("Strings_OperatorsAdjacency", "[strings][operators]") { + // "foo"+"bar" -> tkString, opPlus, tkString + const std::string source = R"("foo"+"bar")"; + lang::Tokenizer tkz{ source }; + + auto t1 = tkz.peek(); + REQUIRE(t1.has_value()); + REQUIRE(t1->value == lang::TokenV::tkString); + REQUIRE(t1->strValue == R"("foo")"); + REQUIRE(tkz.consume().has_value()); + + auto t2 = tkz.peek(); + REQUIRE(t2.has_value()); + REQUIRE(t2->value == lang::TokenV::opPlus); + REQUIRE(tkz.consume().has_value()); + + auto t3 = tkz.peek(); + REQUIRE(t3.has_value()); + REQUIRE(t3->value == lang::TokenV::tkString); + REQUIRE(t3->strValue == R"("bar")"); + REQUIRE(tkz.consume().has_value()); + + auto eof = tkz.peek(); + REQUIRE(eof.has_value()); + REQUIRE(eof->value == lang::TokenV::tkEOF); +} + +TEST_CASE("Strings_Unterminated_Error", "[strings][error]") { + // Missing closing quote should yield an 'Invalid literal' error. 
+ const std::string source = "\"unterminated"; + lang::Tokenizer tkz{ source }; + + auto errTok = tkz.peek(); + REQUIRE_FALSE(errTok.has_value()); + const auto &err = errTok.error(); + + REQUIRE( + err.message.find("Invalid literal") != std::string::npos + ); +} + +TEST_CASE("Chars_Simple", "[chars][valid]") { + constexpr std::array lexemes = { + R"('a')", + R"('Z')", + R"('0')", + R"('_')", + R"('$')" + }; + + CommonLiteralsSuccess(lang::TokenV::tkCharacter, lexemes); +} + +TEST_CASE("Chars_Escapes", "[chars][valid][escapes]") { + constexpr std::array lexemes = { + R"('\n')", + R"('\t')", + R"('\\')", + R"('\'')" + }; + + CommonLiteralsSuccess(lang::TokenV::tkCharacter, lexemes); +} + +TEST_CASE("Chars_Invalid_Empty", "[chars][error]") { + const std::string source = "''"; + lang::Tokenizer tkz{ source }; + + auto errTok = tkz.peek(); + REQUIRE_FALSE(errTok.has_value()); + const auto &err = errTok.error(); + REQUIRE( + err.message.find("Invalid literal") != std::string::npos + ); +} + +TEST_CASE("Chars_Invalid_Multiple", "[chars][error]") { + const std::string source = "'ab'"; + lang::Tokenizer tkz{ source }; + + auto errTok = tkz.peek(); + REQUIRE_FALSE(errTok.has_value()); + const auto &err = errTok.error(); + REQUIRE( + err.message.find("Invalid literal") != std::string::npos + ); +} + +TEST_CASE("Chars_Unterminated", "[chars][error]") { + const std::string source = "'a"; + lang::Tokenizer tkz{ source }; + + auto errTok = tkz.peek(); + REQUIRE_FALSE(errTok.has_value()); + const auto &err = errTok.error(); + REQUIRE( + err.message.find("Invalid literal") != std::string::npos + ); +} + +TEST_CASE("Chars_InvalidEscape", "[chars][error][.escapes]") { /* NOTE(review): a Catch2 tag that starts with a dot marks the test case as hidden, so the default run skips it; confirm this is intended */ + const std::string source = "'\\x'"; + lang::Tokenizer tkz{ source }; + + auto errTok = tkz.peek(); + REQUIRE_FALSE(errTok.has_value()); + const auto &err = errTok.error(); + REQUIRE( + err.message.find("Invalid literal") != std::string::npos + ); +} diff --git a/tests/cmake/dependencies.cmake 
b/tests/cmake/dependencies.cmake new file mode 100644 index 0000000..67d09cb --- /dev/null +++ b/tests/cmake/dependencies.cmake @@ -0,0 +1,18 @@ +include(FetchContent) + +# Get CPM +file( + DOWNLOAD + https://github.com/cpm-cmake/CPM.cmake/releases/download/v0.40.8/CPM.cmake + ${CMAKE_CURRENT_BINARY_DIR}/cmake/CPM.cmake + EXPECTED_HASH + SHA256=78ba32abdf798bc616bab7c73aac32a17bbd7b06ad9e26a6add69de8f3ae4791 +) +include(${CMAKE_CURRENT_BINARY_DIR}/cmake/CPM.cmake) + +# Get dependencies + +CPMAddPackage("gh:catchorg/Catch2@3.6.0") + +# Include Catch2 CMake scripts +list(APPEND CMAKE_MODULE_PATH ${Catch2_SOURCE_DIR}/extras) diff --git a/tests/include/Utils.hpp b/tests/include/Utils.hpp new file mode 100644 index 0000000..1f958ca --- /dev/null +++ b/tests/include/Utils.hpp @@ -0,0 +1,58 @@ +#pragma once + +#include + +#include + +template +requires( + std::is_convertible_v, std::string_view> and + std::is_same_v, std::ranges::range_value_t> +) +arti::lang::Generator> +InterleaveRanges(R1 &&r1, R2 &&r2) { // Yields r1[0], r2[0], r1[1], r2[1], ... and stops as soon as either range is exhausted. + auto it1 = std::ranges::begin(r1); + auto end1 = std::ranges::end(r1); + + auto it2 = std::ranges::begin(r2); + auto end2 = std::ranges::end(r2); + + while (it1 != end1 && it2 != end2) { + co_yield *it1; // co_yield is the standard coroutine keyword, so this no longer depends on a yield macro being defined elsewhere + ++it1; + co_yield *it2; + ++it2; + } +} + +static arti::lang::Generator +WhitespaceGenerator(uint32_t maxLen = 5) { // Endless stream of random runs of ' ', '\t' and '\n', each 1..maxLen characters long; maxLen must be >= 1 because the length distribution is built from (1, maxLen). + constexpr std::array spaceChars{ ' ', '\t', '\n' }; + + std::string str; + std::random_device device; // seeded from std::random_device, so the generated whitespace differs from run to run + std::mt19937 engine(device()); + std::uniform_int_distribution dist(1, maxLen); + std::uniform_int_distribution distChars(0, 2); // inclusive index range into spaceChars + + str.reserve(maxLen); + + while (true) { + str.resize(0); + + auto sz = dist(engine); + + for (uint32_t i = 0; i < sz; ++i) { + str += spaceChars[distChars(engine)]; + } + + co_yield str; // the same buffer is reused and overwritten for the next run + } +} + +template +requires(std::is_same_v, std::string_view>) +static std::string SourceFromTokens(R &&tokens) { // Builds a source string in which every token is followed by a random run of whitespace. + return InterleaveRanges(tokens, WhitespaceGenerator(10)) | std::views::join | + std::ranges::to(); +}