feat(test, tokenizer): Add test suite and range-based API, fix tokenizer issues caught by the tests
Signed-off-by: erick-alcachofa <erick@artichoke.dev>
This commit introduces a comprehensive test suite for the tokenizer
using the Catch2 framework. To support this and improve the project
structure, the build system and the tokenizer's API have been
significantly updated.
- Removed `cmake/testing.cmake` as it's no longer needed.
- A new `TokenizerRange` class provides a C++20-style range interface,
allowing for simple `for-each` loop iteration over tokens. This is
used extensively in the new tests.
- The CMake build system has been refactored:
- An `ENABLE_TESTING` option (OFF by default) now controls whether
the test suite is built.
- The core library is now compiled into an object library, which is
then used to produce both a shared (`.so`/`.dll`) and a static
(`.a`/`.lib`) library. This improves build efficiency and provides
more flexible linkage options.
- The frontend executable now links against the static version of
the library.
- Implemented tests for the tokenizer using the Catch2 framework, covering
various cases like identifiers, keywords, numbers, etc.; these tests have
already caught some issues in the current implementation.
- Several parsing bugs and edge cases in the tokenizer were fixed,
including the handling of unterminated strings and invalid numeric
literals. The README has been updated with instructions for building
and running tests.
This commit is contained in:
parent
0f8688d3ee
commit
e1b9e054f3
@ -15,19 +15,14 @@ set(PROJECT_URL "lang.artichoke.dev")
|
||||
set(PROJECT_AUTHOR "erick-alcachofa")
|
||||
set(PROJECT_AUTHOR_GITHUB "@erick-alcachofa")
|
||||
|
||||
include(cmake/testing.cmake)
|
||||
option(ENABLE_TESTING "Enable build of tests for library" OFF)
|
||||
|
||||
add_subdirectory(lib)
|
||||
add_subdirectory(frontend)
|
||||
|
||||
install(
|
||||
TARGETS frontend library
|
||||
EXPORT artichokeTargets
|
||||
FILE_SET HEADERS
|
||||
LIBRARY DESTINATION lib
|
||||
ARCHIVE DESTINATION lib
|
||||
TARGETS frontend
|
||||
RUNTIME DESTINATION bin
|
||||
INCLUDES DESTINATION include
|
||||
)
|
||||
|
||||
get_target_property(
|
||||
@ -43,6 +38,16 @@ install(
|
||||
)"
|
||||
)
|
||||
|
||||
install(
|
||||
TARGETS library library_static
|
||||
EXPORT artichokeTargets
|
||||
FILE_SET HEADERS
|
||||
LIBRARY DESTINATION lib
|
||||
ARCHIVE DESTINATION lib
|
||||
RUNTIME DESTINATION bin
|
||||
INCLUDES DESTINATION include
|
||||
)
|
||||
|
||||
install(
|
||||
EXPORT artichokeTargets
|
||||
FILE artichokeTargets.cmake
|
||||
@ -62,3 +67,7 @@ install(
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/artichokeConfigVersion.cmake"
|
||||
DESTINATION lib/cmake/artichoke
|
||||
)
|
||||
|
||||
if(ENABLE_TESTING)
|
||||
add_subdirectory(tests)
|
||||
endif()
|
||||
|
||||
@ -46,6 +46,7 @@ grammar is stable, and the next step is the implementation of a compiler
|
||||
git clone https://git.artichoke.dev/me/artichoke-lang.git
|
||||
|
||||
# Configure cmake
|
||||
# Optionally add -DENABLE_TESTING=ON for building tests
|
||||
cmake -DCMAKE_BUILD_TYPE=Release -S . -B build
|
||||
|
||||
# Build the project
|
||||
@ -54,6 +55,9 @@ cmake --build build
|
||||
# Run the binary
|
||||
./build/frontend/artichoke-c
|
||||
|
||||
# Run the tests if enabled
|
||||
ctest --test-dir build/tests --output-on-failure
|
||||
|
||||
# Install if wanted
|
||||
cmake --install build --prefix=/usr/local
|
||||
|
||||
|
||||
@ -34,5 +34,5 @@ target_include_directories(
|
||||
|
||||
target_link_libraries(
|
||||
frontend PUBLIC
|
||||
library
|
||||
artichoke::library_static
|
||||
)
|
||||
|
||||
@ -5,17 +5,17 @@ file(GLOB_RECURSE ARTI_LIB_HEADERS "include/**.hpp")
|
||||
file(GLOB_RECURSE ARTI_LIB_GEN_HEADERS "${CMAKE_CURRENT_BINARY_DIR}/include/**.hpp")
|
||||
|
||||
add_library(
|
||||
library SHARED
|
||||
objs OBJECT
|
||||
${ARTI_LIB_SOURCES}
|
||||
)
|
||||
|
||||
set_target_properties(
|
||||
library PROPERTIES
|
||||
OUTPUT_NAME "artichoke"
|
||||
objs PROPERTIES
|
||||
POSITION_INDEPENDENT_CODE 1
|
||||
)
|
||||
|
||||
target_compile_options(
|
||||
library PRIVATE
|
||||
objs PRIVATE
|
||||
-pedantic
|
||||
-Wall
|
||||
-Wextra
|
||||
@ -30,24 +30,58 @@ target_compile_options(
|
||||
-Wno-unused
|
||||
)
|
||||
|
||||
target_sources(
|
||||
library PUBLIC
|
||||
FILE_SET HEADERS
|
||||
BASE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/include
|
||||
FILES
|
||||
${ARTI_LIB_HEADERS}
|
||||
)
|
||||
|
||||
target_sources(
|
||||
library PUBLIC
|
||||
FILE_SET HEADERS
|
||||
BASE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/include
|
||||
FILES
|
||||
${ARTI_LIB_GEN_HEADERS}
|
||||
)
|
||||
|
||||
target_include_directories(
|
||||
library PUBLIC
|
||||
objs PUBLIC
|
||||
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
|
||||
$<INSTALL_INTERFACE:>
|
||||
)
|
||||
|
||||
add_library(
|
||||
library SHARED
|
||||
$<TARGET_OBJECTS:objs>
|
||||
)
|
||||
|
||||
add_library(
|
||||
artichoke::library ALIAS
|
||||
library
|
||||
)
|
||||
|
||||
set_target_properties(
|
||||
library PROPERTIES
|
||||
OUTPUT_NAME "artichoke"
|
||||
)
|
||||
|
||||
add_library(
|
||||
library_static STATIC
|
||||
$<TARGET_OBJECTS:objs>
|
||||
)
|
||||
|
||||
add_library(
|
||||
artichoke::library_static ALIAS
|
||||
library_static
|
||||
)
|
||||
|
||||
set_target_properties(
|
||||
library_static PROPERTIES
|
||||
OUTPUT_NAME "artichoke"
|
||||
)
|
||||
|
||||
set(LIB_TARGETS objs library library_static)
|
||||
|
||||
foreach(TGET IN LISTS LIB_TARGETS)
|
||||
target_sources(
|
||||
${TGET} INTERFACE
|
||||
FILE_SET HEADERS
|
||||
BASE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/include
|
||||
FILES
|
||||
${ARTI_LIB_HEADERS}
|
||||
)
|
||||
|
||||
target_sources(
|
||||
${TGET} INTERFACE
|
||||
FILE_SET HEADERS
|
||||
BASE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/include
|
||||
FILES
|
||||
${ARTI_LIB_GEN_HEADERS}
|
||||
)
|
||||
endforeach()
|
||||
|
||||
@ -1,12 +1,12 @@
|
||||
#pragma once
|
||||
|
||||
#include <deque>
|
||||
#include <vector>
|
||||
|
||||
#include <artichoke/Util/Expected.hpp>
|
||||
#include <artichoke/Coroutine/Generator.hpp>
|
||||
|
||||
#include <artichoke/Tokenizer/Token.hpp>
|
||||
#include <artichoke/Tokenizer/TokenizerRange.hpp>
|
||||
|
||||
namespace arti::lang {
|
||||
|
||||
@ -30,6 +30,8 @@ namespace arti::lang {
|
||||
|
||||
void swap(Tokenizer &other) noexcept;
|
||||
|
||||
TokenizerRange range() noexcept;
|
||||
|
||||
private:
|
||||
Generator<Expected<Token>> tokenize();
|
||||
|
||||
|
||||
71
lib/include/artichoke/Tokenizer/TokenizerRange.hpp
Normal file
71
lib/include/artichoke/Tokenizer/TokenizerRange.hpp
Normal file
@ -0,0 +1,71 @@
|
||||
#pragma once
|
||||
|
||||
#include <cstddef>
#include <iterator>

#include <artichoke/Util/Expected.hpp>
#include <artichoke/Tokenizer/Token.hpp>
|
||||
|
||||
namespace arti::lang {
|
||||
struct Tokenizer;
|
||||
|
||||
struct [[nodiscard]] TokenizerRange {
|
||||
friend struct Tokenizer;
|
||||
struct Iterator;
|
||||
struct Sentinel;
|
||||
|
||||
using iterator_type = Iterator;
|
||||
using sentinel_type = Sentinel;
|
||||
|
||||
TokenizerRange(TokenizerRange &&) noexcept;
|
||||
TokenizerRange &operator=(TokenizerRange &&) noexcept;
|
||||
|
||||
TokenizerRange(const TokenizerRange &) noexcept = delete;
|
||||
TokenizerRange &operator=(const TokenizerRange &) noexcept = delete;
|
||||
|
||||
Iterator begin();
|
||||
Sentinel end() const noexcept;
|
||||
|
||||
struct Iterator {
|
||||
friend struct TokenizerRange;
|
||||
using iterator_category = std::input_iterator_tag;
|
||||
using difference_type = std::ptrdiff_t;
|
||||
|
||||
using ValueType = Expected<Token>;
|
||||
using ReferenceType = ValueType &;
|
||||
using PointerType = ValueType *;
|
||||
|
||||
using value_type = ValueType;
|
||||
using pointer_type = PointerType;
|
||||
|
||||
Iterator(Iterator &&) noexcept;
|
||||
Iterator &operator=(Iterator &&) noexcept;
|
||||
|
||||
Iterator(const Iterator &) noexcept = delete;
|
||||
Iterator &operator=(const Iterator &) noexcept = delete;
|
||||
|
||||
Iterator &operator++();
|
||||
|
||||
void operator++(int);
|
||||
|
||||
ReferenceType operator*() const noexcept;
|
||||
PointerType operator->() const noexcept;
|
||||
|
||||
friend bool operator==(const Iterator &, Sentinel);
|
||||
friend bool operator==(Sentinel, const Iterator &);
|
||||
|
||||
friend bool operator!=(const Iterator &, Sentinel);
|
||||
friend bool operator!=(Sentinel, const Iterator &);
|
||||
|
||||
private:
|
||||
Iterator(Tokenizer *tokenizer) noexcept;
|
||||
|
||||
Tokenizer *tokenizer;
|
||||
mutable Expected<Token> cvalue;
|
||||
};
|
||||
|
||||
struct Sentinel {};
|
||||
|
||||
private:
|
||||
TokenizerRange(Tokenizer *tokenizer);
|
||||
|
||||
Tokenizer *tokenizer;
|
||||
};
|
||||
}
|
||||
@ -1,8 +1,8 @@
|
||||
#include <artichoke/Tokenizer/Tokenizer.hpp>
|
||||
|
||||
#include <print>
|
||||
#include <utility>
|
||||
|
||||
#include <artichoke/Tokenizer/TokenizerRange.hpp>
|
||||
#include <artichoke/Util/Strings.hpp>
|
||||
#include <artichoke/Util/Demangle.hpp>
|
||||
#include <artichoke/Util/TrieMap.hpp>
|
||||
@ -36,8 +36,22 @@ namespace arti::lang {
|
||||
return *this;
|
||||
}
|
||||
|
||||
/**
 * Exposes this tokenizer as a TokenizerRange so its tokens can be walked
 * with a range-based for loop. The range is constructed from a pointer to
 * this tokenizer.
 */
TokenizerRange Tokenizer::range() noexcept {
  return TokenizerRange(this);
}
|
||||
|
||||
Expected<void> Tokenizer::consume(std::size_t n) noexcept {
|
||||
while (not tokensBuffer.empty()) {
|
||||
while (n > 0) {
|
||||
if (tokensBuffer.empty()) {
|
||||
if (auto ok = peek(); ! ok) {
|
||||
return Unexpected<>{ ok.error() };
|
||||
}
|
||||
|
||||
if (finished()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
tokensBuffer.pop_front();
|
||||
n -= 1;
|
||||
}
|
||||
@ -94,7 +108,11 @@ namespace arti::lang {
|
||||
|
||||
bool Tokenizer::finished() const noexcept {
|
||||
if (tokensGenerator.finished()) {
|
||||
return tokensBuffer.empty();
|
||||
if (!tokensBuffer.empty()) {
|
||||
return tokensBuffer.front().value == TokenV::tkEOF;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
@ -340,6 +358,7 @@ namespace arti::lang {
|
||||
}
|
||||
|
||||
if (*iter == '.') {
|
||||
auto dotIter = iter;
|
||||
forward();
|
||||
|
||||
while (iter != source.end()) {
|
||||
@ -350,6 +369,21 @@ namespace arti::lang {
|
||||
forward();
|
||||
}
|
||||
|
||||
if ((iter - dotIter) == 1) {
  /*
   * Only the '.' itself was consumed, so no digit follows the dot. `iter`
   * is exactly one past the dot here (possibly source.end()); remember that
   * position for the diagnostic before rewinding.
   */
  const auto afterDot = iter;

  /* Revert to dot */
  --iter;
  --column;

  return langException<ExceptCode::ecInvalidLiteral>(
    line,
    column,
    "digit",
    /*
     * Test the position after the dot, not the rewound `iter`: `iter` now
     * points at '.', so comparing it with source.end() was always false and
     * input ending in '.' dereferenced the end iterator.
     */
    afterDot == source.end()
      ? "EOF"
      : std::string{ *afterDot }
  );
}
|
||||
|
||||
return Token{
|
||||
TokenV::tkDecimal,
|
||||
cLine,
|
||||
@ -390,7 +424,6 @@ namespace arti::lang {
|
||||
}
|
||||
|
||||
if (*iter == '"') {
|
||||
forward();
|
||||
break;
|
||||
}
|
||||
|
||||
@ -412,12 +445,23 @@ namespace arti::lang {
|
||||
forward();
|
||||
}
|
||||
|
||||
return Token{
|
||||
TokenV::tkString,
|
||||
cLine,
|
||||
cColumn,
|
||||
{ stIter, iter }
|
||||
};
|
||||
if (*iter == '"') {
|
||||
forward();
|
||||
|
||||
return Token{
|
||||
TokenV::tkString,
|
||||
cLine,
|
||||
cColumn,
|
||||
{ stIter, iter }
|
||||
};
|
||||
}
|
||||
|
||||
return langException<ExceptCode::ecInvalidLiteral>(
|
||||
line,
|
||||
column,
|
||||
"end of string (\")",
|
||||
"EOF"
|
||||
);
|
||||
}
|
||||
|
||||
Expected<Token> Tokenizer::readCharacter() {
|
||||
|
||||
84
lib/src/Tokenizer/TokenizerRange.cpp
Normal file
84
lib/src/Tokenizer/TokenizerRange.cpp
Normal file
@ -0,0 +1,84 @@
|
||||
#include <artichoke/Tokenizer/TokenizerRange.hpp>
|
||||
|
||||
#include <utility>
|
||||
|
||||
#include <artichoke/Tokenizer/Tokenizer.hpp>
|
||||
|
||||
namespace arti::lang {
|
||||
|
||||
using Iterator = TokenizerRange::Iterator;
|
||||
using Sentinel = TokenizerRange::Sentinel;
|
||||
|
||||
TokenizerRange::TokenizerRange(Tokenizer *tokenizer)
|
||||
: tokenizer(tokenizer) { }
|
||||
|
||||
TokenizerRange::TokenizerRange(TokenizerRange &&other) noexcept
|
||||
: tokenizer(std::exchange(other.tokenizer, nullptr)) { }
|
||||
|
||||
TokenizerRange &TokenizerRange::operator=(TokenizerRange &&other) noexcept {
|
||||
this->tokenizer = std::exchange(other.tokenizer, nullptr);
|
||||
return *this;
|
||||
}
|
||||
|
||||
Iterator TokenizerRange::begin() {
|
||||
return Iterator{ this->tokenizer };
|
||||
}
|
||||
|
||||
Sentinel TokenizerRange::end() const noexcept {
|
||||
return Sentinel{};
|
||||
}
|
||||
|
||||
Iterator::Iterator(Tokenizer *tokenizer) noexcept
|
||||
: tokenizer(tokenizer)
|
||||
, cvalue(tokenizer->peek()) { }
|
||||
|
||||
Iterator::Iterator(Iterator &&other) noexcept
|
||||
: tokenizer(std::exchange(other.tokenizer, nullptr))
|
||||
, cvalue(std::exchange(other.cvalue, {})) { }
|
||||
|
||||
Iterator &Iterator::operator=(Iterator &&other) noexcept {
|
||||
this->tokenizer = std::exchange(other.tokenizer, nullptr);
|
||||
this->cvalue = std::exchange(other.cvalue, {});
|
||||
return *this;
|
||||
}
|
||||
|
||||
Iterator &Iterator::operator++() {
|
||||
if (this->cvalue) {
|
||||
std::ignore = tokenizer->consume();
|
||||
}
|
||||
this->cvalue = tokenizer->peek();
|
||||
return *this;
|
||||
}
|
||||
|
||||
void Iterator::operator++(int) {
|
||||
std::ignore = this->operator++();
|
||||
}
|
||||
|
||||
Iterator::ReferenceType Iterator::operator*() const noexcept {
|
||||
return this->cvalue;
|
||||
}
|
||||
|
||||
Iterator::PointerType Iterator::operator->() const noexcept {
|
||||
return &this->cvalue;
|
||||
}
|
||||
|
||||
bool operator==(const Iterator &it, Sentinel) {
|
||||
if (it.tokenizer->finished()) {
|
||||
return true;
|
||||
}
|
||||
return !it.cvalue.has_value() || it.cvalue->value == TokenV::tkEOF;
|
||||
}
|
||||
|
||||
bool operator==(Sentinel, const Iterator &it) {
|
||||
return it == Sentinel{};
|
||||
}
|
||||
|
||||
bool operator!=(const Iterator &it, Sentinel) {
|
||||
return !(it == Sentinel{});
|
||||
}
|
||||
|
||||
bool operator!=(Sentinel, const Iterator &it) {
|
||||
return !(it == Sentinel{});
|
||||
}
|
||||
|
||||
} // namespace arti::lang
|
||||
20
tests/CMakeLists.txt
Normal file
20
tests/CMakeLists.txt
Normal file
@ -0,0 +1,20 @@
|
||||
# Test-only dependencies (presumably what makes the Catch2 targets and the
# `Catch` module used below available — confirm in cmake/dependencies.cmake).
include(cmake/dependencies.cmake)

# Enable CTest for this directory and everything below it.
enable_testing()

# Interface target bundling what every test executable needs: the shared test
# headers, the library under test and Catch2 with its provided main().
# NOTE(review): `test` is also the name CMake reserves for its own target when
# testing is enabled in the top-level directory (policy CMP0037); a more
# specific name would be safer — confirm before renaming, since
# subdirectories link against it.
add_library(test INTERFACE)
target_include_directories(test INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/include)
target_link_libraries(test INTERFACE artichoke::library Catch2::Catch2WithMain)

# Provides catch_discover_tests() for the per-suite subdirectories.
include(Catch)

add_subdirectory(Tokenizer)
|
||||
16
tests/Tokenizer/CMakeLists.txt
Normal file
16
tests/Tokenizer/CMakeLists.txt
Normal file
@ -0,0 +1,16 @@
|
||||
# Every .cpp below src/ is part of the tokenizer test executable.
file(GLOB_RECURSE TOKENIZER_TEST_SRC "src/**.cpp")

add_executable(test-tokenizer ${TOKENIZER_TEST_SRC})

# `test` is the shared interface target declared by the parent directory.
target_link_libraries(test-tokenizer PRIVATE test)

# Register each Catch2 test case with CTest under the "Tokenizer." prefix.
catch_discover_tests(test-tokenizer TEST_PREFIX "Tokenizer.")
|
||||
99
tests/Tokenizer/src/Api.cpp
Normal file
99
tests/Tokenizer/src/Api.cpp
Normal file
@ -0,0 +1,99 @@
|
||||
#include <catch2/catch_test_macros.hpp>
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include <artichoke/Tokenizer/Tokenizer.hpp>
|
||||
|
||||
namespace lang = arti::lang;
|
||||
|
||||
TEST_CASE("API_PeekOffset", "[api][peek]") {
  const std::string source = "a b c";
  lang::Tokenizer tkz{ source };

  // Shared assertions: the look-ahead result is the identifier `text`.
  const auto requireIdentifier = [](auto peeked, const char *text) {
    REQUIRE(peeked.has_value());
    REQUIRE(peeked->value == lang::TokenV::tkIdentifier);
    REQUIRE(peeked->strValue == text);
  };

  // Look ahead over all three identifiers ...
  requireIdentifier(tkz.peek(0), "a");
  requireIdentifier(tkz.peek(1), "b");
  requireIdentifier(tkz.peek(2), "c");

  // ... and one position further, where the end-of-input token is expected.
  auto pastLast = tkz.peek(3);
  REQUIRE(pastLast.has_value());
  REQUIRE(pastLast->value == lang::TokenV::tkEOF);

  // Peeking must not consume: the tokenizer is not finished and the first
  // token is still "a".
  REQUIRE_FALSE(tkz.finished());
  requireIdentifier(tkz.peek(), "a");
}
|
||||
|
||||
TEST_CASE("API_ConsumeAndFinishedSemantics", "[api][consume][finished]") {
  const std::string source = "x y z";
  lang::Tokenizer tkz{ source };

  // Dropping "x" and "y" in one call leaves "z" as the current token.
  REQUIRE(tkz.consume(2).has_value());
  REQUIRE_FALSE(tkz.finished());

  auto remaining = tkz.peek();
  REQUIRE(remaining.has_value());
  REQUIRE(remaining->value == lang::TokenV::tkIdentifier);
  REQUIRE(remaining->strValue == "z");

  // Consuming "z" alone does not finish the tokenizer: finished() only
  // turns true once the EOF token has actually been produced.
  REQUIRE(tkz.consume().has_value());
  REQUIRE_FALSE(tkz.finished());

  auto endToken = tkz.peek();
  REQUIRE(endToken.has_value());
  REQUIRE(endToken->value == lang::TokenV::tkEOF);
  REQUIRE(tkz.finished());
}
|
||||
|
||||
TEST_CASE("API_RangeIterationStopsAtEOF", "[api][range]") {
  const std::string source =
    "let ident := 42 /* skip this */ + 1";
  lang::Tokenizer tkz{ source };

  // Collect what the range yields.
  std::vector<lang::TokenV> seenKinds;
  std::vector<std::string_view> seenText;

  for (auto token : tkz.range()) {
    REQUIRE(token.has_value());
    seenKinds.push_back(token->value);
    seenText.push_back(token->strValue);
  }

  // The block comment is skipped and EOF is not yielded, leaving six tokens.
  const lang::TokenV wantKinds[] = {
    lang::TokenV::kwLet,
    lang::TokenV::tkIdentifier,
    lang::TokenV::opLabel,
    lang::TokenV::tkInteger,
    lang::TokenV::opPlus,
    lang::TokenV::tkInteger,
  };
  const char *const wantText[] = { "let", "ident", ":=", "42", "+", "1" };

  REQUIRE(seenKinds.size() == 6);
  for (std::size_t i = 0; i < 6; ++i) {
    REQUIRE(seenKinds[i] == wantKinds[i]);
  }
  for (std::size_t i = 0; i < 6; ++i) {
    REQUIRE(seenText[i] == wantText[i]);
  }

  // Iteration stopped in front of EOF, so it is still there to be peeked.
  auto endToken = tkz.peek();
  REQUIRE(endToken.has_value());
  REQUIRE(endToken->value == lang::TokenV::tkEOF);
}
|
||||
87
tests/Tokenizer/src/Comments.cpp
Normal file
87
tests/Tokenizer/src/Comments.cpp
Normal file
@ -0,0 +1,87 @@
|
||||
#include <catch2/catch_test_macros.hpp>
|
||||
|
||||
#include <string>
|
||||
|
||||
#include <artichoke/Tokenizer/Tokenizer.hpp>
|
||||
#include <artichoke/Util/Expected.hpp>
|
||||
|
||||
namespace lang = arti::lang;
|
||||
|
||||
TEST_CASE("Comments_BlockSkipped", "[comments][block][skip]") {
  // A block comment between two identifiers must not produce any token.
  const std::string source = "foo /* a block comment with symbols 123 !@# */ bar";
  lang::Tokenizer tkz{source};

  // Checks that the current token is the identifier `text`, then consumes it.
  const auto takeIdentifier = [&tkz](const char *text) {
    auto tok = tkz.peek();
    REQUIRE(tok.has_value());
    REQUIRE(tok->value == lang::TokenV::tkIdentifier);
    REQUIRE(tok->strValue == text);
    REQUIRE(tkz.consume().has_value());
  };

  takeIdentifier("foo");
  takeIdentifier("bar");

  auto endToken = tkz.peek();
  REQUIRE(endToken.has_value());
  REQUIRE(endToken->value == lang::TokenV::tkEOF);
}
|
||||
|
||||
TEST_CASE("Comments_BlockMultiline", "[comments][block][multiline]") {
  // Same as the single-line case, but the comment spans three lines.
  const std::string source =
    "alpha /* line1\n"
    "line2\n"
    "line3 */ beta";
  lang::Tokenizer tkz{source};

  // Checks that the current token is the identifier `text`, then consumes it.
  const auto takeIdentifier = [&tkz](const char *text) {
    auto tok = tkz.peek();
    REQUIRE(tok.has_value());
    REQUIRE(tok->value == lang::TokenV::tkIdentifier);
    REQUIRE(tok->strValue == text);
    REQUIRE(tkz.consume().has_value());
  };

  takeIdentifier("alpha");
  takeIdentifier("beta");

  auto endToken = tkz.peek();
  REQUIRE(endToken.has_value());
  REQUIRE(endToken->value == lang::TokenV::tkEOF);
}
|
||||
|
||||
TEST_CASE("Comments_UnterminatedBlock_Error", "[comments][block][error]") {
  // A block comment that is never closed has to be reported as an error.
  const std::string source = "foo /* this never ends...";
  lang::Tokenizer tkz{source};

  // The identifier in front of the comment is still tokenized normally.
  auto leading = tkz.peek();
  REQUIRE(leading.has_value());
  REQUIRE(leading->value == lang::TokenV::tkIdentifier);
  REQUIRE(leading->strValue == "foo");
  REQUIRE(tkz.consume().has_value());

  // The next peek runs into the unterminated comment.
  auto failure = tkz.peek();
  REQUIRE_FALSE(failure.has_value());

  // The reported message has to name the problem.
  REQUIRE(failure.error().message.find("Invalid comment") != std::string::npos);
}
|
||||
|
||||
TEST_CASE("Comments_SingleLineUnsupported_Skip", "[comments][.line]") {
  // Placeholder: '//' comments are not handled by the tokenizer yet, so the
  // test skips itself right away instead of failing. The statements after
  // SKIP() only sketch the intended input.
  SKIP("Single-line '//' comments are not supported yet by the tokenizer");

  const std::string source = "foo // comment\n bar";
  lang::Tokenizer tkz{source};
  static_cast<void>(tkz); // keep the compiler quiet about the unused local
}
|
||||
127
tests/Tokenizer/src/Identifiers.cpp
Normal file
127
tests/Tokenizer/src/Identifiers.cpp
Normal file
@ -0,0 +1,127 @@
|
||||
#include <catch2/catch_test_macros.hpp>
|
||||
|
||||
#include <array>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include <artichoke/Tokenizer/Tokenizer.hpp>
|
||||
#include <Utils.hpp>
|
||||
|
||||
namespace lang = arti::lang;
|
||||
|
||||
template <std::size_t N>
|
||||
static void CommonIdentifiersSuccess(
|
||||
const std::array<std::string_view, N> &ids
|
||||
) {
|
||||
const std::string source = SourceFromTokens(ids);
|
||||
|
||||
std::size_t it = 0;
|
||||
lang::Tokenizer tkz{ source };
|
||||
|
||||
for (auto token : tkz.range()) {
|
||||
REQUIRE(token.has_value());
|
||||
REQUIRE(token->value == lang::TokenV::tkIdentifier);
|
||||
REQUIRE(token->strValue == ids.at(it++));
|
||||
}
|
||||
|
||||
REQUIRE(it == ids.size());
|
||||
REQUIRE(tkz.peek().has_value());
|
||||
REQUIRE(tkz.peek()->value == lang::TokenV::tkEOF);
|
||||
}
|
||||
|
||||
TEST_CASE("Identifiers_Basic", "[identifiers][valid]") {
  // Plain, snake_case, camelCase, PascalCase, leading-underscore and
  // digit-suffixed names.
  constexpr std::array<std::string_view, 8> names = {
    "a", "abc", "a_b", "snake_case", "camelCase", "PascalCase", "_id", "with123"
  };

  CommonIdentifiersSuccess(names);
}
|
||||
|
||||
TEST_CASE("Identifiers_DigitsAfterFirst", "[identifiers][valid]") {
  // Digits are fine anywhere but in the first position.
  constexpr std::array<std::string_view, 6> names = {
    "a1", "abc123", "_a1_b2", "v2", "x9y8z7", "i18n"
  };

  CommonIdentifiersSuccess(names);
}
|
||||
|
||||
TEST_CASE("Identifiers_Long", "[identifiers][valid][long]") {
  // A 512-character identifier has to come back as one single token.
  std::string longName(512, 'a');
  std::vector<std::string_view> pieces{ longName };

  const std::string source = SourceFromTokens(pieces);
  lang::Tokenizer tkz{ source };

  auto first = tkz.peek();
  REQUIRE(first.has_value());
  REQUIRE(first->value == lang::TokenV::tkIdentifier);
  REQUIRE(first->strValue == longName);
  REQUIRE(tkz.consume().has_value());

  // Nothing but EOF may follow.
  auto endToken = tkz.peek();
  REQUIRE(endToken.has_value());
  REQUIRE(endToken->value == lang::TokenV::tkEOF);
}
|
||||
|
||||
TEST_CASE("Identifiers_WithOperators", "[identifiers][operators]") {
  // '$' and '?' are operators of their own and never part of an identifier:
  //   "$foo" -> opMut, tkIdentifier("foo")
  //   "?bar" -> opOpt, tkIdentifier("bar")
  const std::string source = "$foo ?bar";
  lang::Tokenizer tkz{ source };

  // Current token must be the operator `kind`; consume it.
  const auto takeOperator = [&tkz](lang::TokenV kind) {
    auto tok = tkz.peek();
    REQUIRE(tok.has_value());
    REQUIRE(tok->value == kind);
    REQUIRE(tkz.consume().has_value());
  };

  // Current token must be the identifier `text`; consume it.
  const auto takeIdentifier = [&tkz](const char *text) {
    auto tok = tkz.peek();
    REQUIRE(tok.has_value());
    REQUIRE(tok->value == lang::TokenV::tkIdentifier);
    REQUIRE(tok->strValue == text);
    REQUIRE(tkz.consume().has_value());
  };

  takeOperator(lang::TokenV::opMut);
  takeIdentifier("foo");
  takeOperator(lang::TokenV::opOpt);
  takeIdentifier("bar");

  auto endToken = tkz.peek();
  REQUIRE(endToken.has_value());
  REQUIRE(endToken->value == lang::TokenV::tkEOF);
}
|
||||
|
||||
TEST_CASE("Identifiers_DotAccess", "[identifiers][dot]") {
  // "foo.bar" splits into identifier, dot operator, identifier.
  const std::string source = "foo.bar";
  lang::Tokenizer tkz{ source };

  // Current token must be the identifier `text`; consume it.
  const auto takeIdentifier = [&tkz](const char *text) {
    auto tok = tkz.peek();
    REQUIRE(tok.has_value());
    REQUIRE(tok->value == lang::TokenV::tkIdentifier);
    REQUIRE(tok->strValue == text);
    REQUIRE(tkz.consume().has_value());
  };

  takeIdentifier("foo");

  auto dot = tkz.peek();
  REQUIRE(dot.has_value());
  REQUIRE(dot->value == lang::TokenV::opDot);
  REQUIRE(tkz.consume().has_value());

  takeIdentifier("bar");

  auto endToken = tkz.peek();
  REQUIRE(endToken.has_value());
  REQUIRE(endToken->value == lang::TokenV::tkEOF);
}
|
||||
93
tests/Tokenizer/src/Keywords.cpp
Normal file
93
tests/Tokenizer/src/Keywords.cpp
Normal file
@ -0,0 +1,93 @@
|
||||
#include <catch2/catch_test_macros.hpp>
|
||||
|
||||
#include <array>
|
||||
|
||||
#include <artichoke/Tokenizer/Tokenizer.hpp>
|
||||
|
||||
#include <Utils.hpp>
|
||||
|
||||
namespace lang = arti::lang;
|
||||
|
||||
template <std::size_t N>
|
||||
static void CommonKeywordsSuccess(
|
||||
const std::array<std::string_view, N> &lexemes,
|
||||
const std::array<lang::TokenV, N> &kinds
|
||||
) {
|
||||
static_assert(N > 0, "Must provide at least one keyword");
|
||||
const std::string source = SourceFromTokens(lexemes);
|
||||
|
||||
std::size_t it = 0;
|
||||
lang::Tokenizer tkz{ source };
|
||||
|
||||
for (auto token : tkz.range()) {
|
||||
REQUIRE(token.has_value());
|
||||
REQUIRE(token->value == kinds.at(it));
|
||||
REQUIRE(token->strValue == lexemes.at(it));
|
||||
++it;
|
||||
}
|
||||
|
||||
REQUIRE(it == lexemes.size());
|
||||
REQUIRE(tkz.peek().has_value());
|
||||
REQUIRE(tkz.peek()->value == lang::TokenV::tkEOF);
|
||||
}
|
||||
|
||||
TEST_CASE("Keywords_AllRecognized", "[keywords][valid]") {
  // Keyword spellings; `tokenKinds` below lists the expected kind for each
  // entry at the same position.
  // NOTE(review): "this"/kwThis is listed twice (26th and 31st entry) —
  // possibly a different keyword was meant for the last slot; confirm.
  constexpr std::array<std::string_view, 31> spellings = {
    "_", "or", "not", "and", "if", "else", "fn", "enum",
    "struct", "def", "let", "for", "loop", "break", "continue", "while",
    "match", "switch", "return", "unreachable", "defer", "errdefer",
    "true", "false", "null", "this", "import", "export", "module", "using",
    "this"
  };

  constexpr std::array<lang::TokenV, 31> tokenKinds = {
    lang::TokenV::kwUnderscore, lang::TokenV::kwOr,
    lang::TokenV::kwNot,        lang::TokenV::kwAnd,
    lang::TokenV::kwIf,         lang::TokenV::kwElse,
    lang::TokenV::kwFn,         lang::TokenV::kwEnum,
    lang::TokenV::kwStruct,     lang::TokenV::kwDef,
    lang::TokenV::kwLet,        lang::TokenV::kwFor,
    lang::TokenV::kwLoop,       lang::TokenV::kwBreak,
    lang::TokenV::kwContinue,   lang::TokenV::kwWhile,
    lang::TokenV::kwMatch,      lang::TokenV::kwSwitch,
    lang::TokenV::kwReturn,     lang::TokenV::kwUnreachable,
    lang::TokenV::kwDefer,      lang::TokenV::kwErrDefer,
    lang::TokenV::kwTrue,       lang::TokenV::kwFalse,
    lang::TokenV::kwNull,       lang::TokenV::kwThis,
    lang::TokenV::kwImport,     lang::TokenV::kwExport,
    lang::TokenV::kwModule,     lang::TokenV::kwUsing,
    lang::TokenV::kwThis,
  };

  CommonKeywordsSuccess(spellings, tokenKinds);
}
|
||||
|
||||
TEST_CASE("Keywords_PrecedenceOverIdentifiers", "[keywords][precedence]") {
  // An exact keyword spelling is a keyword; anything longer that merely
  // starts with one stays an identifier.
  constexpr std::array<std::string_view, 6> spellings = {
    "if", "iff", "return", "returnX", "_", "_id"
  };
  constexpr std::array<lang::TokenV, 6> tokenKinds = {
    lang::TokenV::kwIf,         // "if"      - keyword
    lang::TokenV::tkIdentifier, // "iff"     - identifier
    lang::TokenV::kwReturn,     // "return"  - keyword
    lang::TokenV::tkIdentifier, // "returnX" - identifier
    lang::TokenV::kwUnderscore, // "_"       - keyword in this language
    lang::TokenV::tkIdentifier  // "_id"     - identifier
  };

  CommonKeywordsSuccess(spellings, tokenKinds);
}
|
||||
171
tests/Tokenizer/src/Numbers.cpp
Normal file
171
tests/Tokenizer/src/Numbers.cpp
Normal file
@ -0,0 +1,171 @@
|
||||
#include <catch2/catch_test_macros.hpp>
|
||||
|
||||
#include <array>
|
||||
|
||||
#include <artichoke/Tokenizer/Tokenizer.hpp>
|
||||
|
||||
#include <Utils.hpp>
|
||||
|
||||
namespace lang = arti::lang;
|
||||
|
||||
template <std::size_t N>
|
||||
static void CommonIntegersSuccess(
|
||||
lang::TokenV type,
|
||||
const std::array<std::string_view, N> &expected
|
||||
) {
|
||||
const std::string source = SourceFromTokens(expected);
|
||||
|
||||
std::size_t it = 0;
|
||||
lang::Tokenizer tkz{ source };
|
||||
|
||||
for (auto token : tkz.range()) {
|
||||
REQUIRE(token.has_value());
|
||||
REQUIRE(token->value == type);
|
||||
REQUIRE(token->strValue == expected.at(it++));
|
||||
}
|
||||
|
||||
REQUIRE(it == expected.size());
|
||||
REQUIRE(tkz.peek().has_value());
|
||||
REQUIRE(tkz.peek()->value == lang::TokenV::tkEOF);
|
||||
}
|
||||
|
||||
TEST_CASE("Numbers_Integers", "[integers][valid]") {
  // Base-10 integers, from a single digit up to 17 digits.
  constexpr std::array<std::string_view, 6> literals = {
    "0", "1", "42", "123456", "98712390", "12381723912465471"
  };

  CommonIntegersSuccess(lang::TokenV::tkInteger, literals);
}
|
||||
|
||||
TEST_CASE("Numbers_HexIntegers", "[integers][valid]") {
  // "0x"-prefixed literals with upper- and lower-case hex digits.
  constexpr std::array<std::string_view, 6> literals = {
    "0x831", "0xAFEFE", "0xABEBE",
    "0x7a147e8a3", "0x98712390", "0x1d238c18e7ff239a12465471"
  };

  CommonIntegersSuccess(lang::TokenV::tkInteger, literals);
}
|
||||
|
||||
TEST_CASE("Numbers_OctIntegers", "[integers][valid]") {
  // Literals with a leading '0' followed by octal digits.
  constexpr std::array<std::string_view, 6> literals = {
    "041", "064123", "0136237", "012345", "01", "071236571236512631723651"
  };

  CommonIntegersSuccess(lang::TokenV::tkInteger, literals);
}
|
||||
|
||||
TEST_CASE("Numbers_BinIntegers", "[integers][valid]") {
  // "0b"-prefixed literals made of binary digits.
  constexpr std::array<std::string_view, 6> literals = {
    "0b0101101", "0b1", "0b01", "0b0", "0b011010101110101101110101011",
    "0b11110101011010101"
  };

  CommonIntegersSuccess(lang::TokenV::tkInteger, literals);
}
|
||||
|
||||
TEST_CASE("Numbers_Decimal", "[decimals][valid]") {
  // Literals with digits on both sides of the decimal point.
  constexpr std::array<std::string_view, 6> literals = {
    "1.0", "0.5", "3.14159", "10.50", "9999.0001", "1375123476175981.813751235"
  };

  // The shared helper only compares kind and text, so it serves decimals too.
  CommonIntegersSuccess(lang::TokenV::tkDecimal, literals);
}
|
||||
|
||||
TEST_CASE("Numbers_UnaryMinusSeparate", "[numbers][unary-minus]") {
  // A leading '-' is never part of the literal: it is its own opHyphen token.
  const std::string source = "-1 -2.5";
  lang::Tokenizer tkz{ source };

  // Current token must be a hyphen operator; consume it.
  const auto takeHyphen = [&tkz]() {
    auto tok = tkz.peek();
    REQUIRE(tok.has_value());
    REQUIRE(tok->value == lang::TokenV::opHyphen);
    REQUIRE(tkz.consume().has_value());
  };

  // Current token must be a number of kind `kind` spelled `text`; consume it.
  const auto takeNumber = [&tkz](lang::TokenV kind, const char *text) {
    auto tok = tkz.peek();
    REQUIRE(tok.has_value());
    REQUIRE(tok->value == kind);
    REQUIRE(tok->strValue == text);
    REQUIRE(tkz.consume().has_value());
  };

  takeHyphen();
  takeNumber(lang::TokenV::tkInteger, "1");
  takeHyphen();
  takeNumber(lang::TokenV::tkDecimal, "2.5");

  auto endToken = tkz.peek();
  REQUIRE(endToken.has_value());
  REQUIRE(endToken->value == lang::TokenV::tkEOF);
}
|
||||
|
||||
// Pins how '.' interacts with numeric literals. A number has to start with a
// digit and a '.' inside a number has to be followed by a digit:
//   .5    ->  '.'  '5'
//   10.   ->  error, after which a '.' token is readable
//   1..2  ->  error, after which '.' '.' '2' are readable
// After each failed peek() the test calls peek() again WITHOUT consume() and
// expects a valid '.' token; it therefore relies on the tokenizer not
// reporting the same error twice (presumably the failed peek moves past the
// malformed literal - confirm against the Tokenizer implementation).
TEST_CASE("Numbers_DotBoundaries_Disambiguation", "[numbers][dot][edge]") {
    const std::string source{ ".5 10. 1..2" };
    lang::Tokenizer tkz{ source };

    // ---- ".5": a dot followed by the integer 5 ----
    auto leadingDot = tkz.peek();
    REQUIRE(leadingDot.has_value());
    REQUIRE(leadingDot->value == lang::TokenV::opDot);
    REQUIRE(tkz.consume().has_value());

    auto five = tkz.peek();
    REQUIRE(five.has_value());
    REQUIRE(five->value == lang::TokenV::tkInteger);
    REQUIRE(five->strValue == "5");
    REQUIRE(tkz.consume().has_value());

    // ---- "10.": error first, then the dot becomes visible ----
    auto trailingDotError = tkz.peek();
    REQUIRE_FALSE(trailingDotError.has_value());

    auto dotAfterTen = tkz.peek();
    REQUIRE(dotAfterTen.has_value());
    REQUIRE(dotAfterTen->value == lang::TokenV::opDot);
    REQUIRE(tkz.consume().has_value());

    // ---- "1..2": error first, then '.', '.', '2' ----
    auto doubleDotError = tkz.peek();
    REQUIRE_FALSE(doubleDotError.has_value());

    auto firstDot = tkz.peek();
    REQUIRE(firstDot.has_value());
    REQUIRE(firstDot->value == lang::TokenV::opDot);
    REQUIRE(tkz.consume().has_value());

    auto secondDot = tkz.peek();
    REQUIRE(secondDot.has_value());
    REQUIRE(secondDot->value == lang::TokenV::opDot);
    REQUIRE(tkz.consume().has_value());

    auto two = tkz.peek();
    REQUIRE(two.has_value());
    REQUIRE(two->value == lang::TokenV::tkInteger);
    REQUIRE(two->strValue == "2");
    REQUIRE(tkz.consume().has_value());

    // ---- end of input ----
    auto endMarker = tkz.peek();
    REQUIRE(endMarker.has_value());
    REQUIRE(endMarker->value == lang::TokenV::tkEOF);
}
|
||||
|
||||
// Malformed numeric prefixes must be rejected: a radix prefix with no digits
// ("0x", "0b"), a digit outside the radix ("0xG", "0b2"), and a non-octal
// digit after a leading zero ("08"). Each input is tokenized on its own and
// the very first peek() has to fail with a message containing
// "Invalid literal".
TEST_CASE("Numbers_InvalidPrefixes", "[numbers][invalid]") {
    const std::array<const char*, 5> invalids = { "0x", "0b", "0xG", "0b2", "08" };

    for (const char *src : invalids) {
        // Keep the text in a named string that outlives the tokenizer, exactly
        // like every other test does. Building the tokenizer from a temporary
        // std::string would leave it reading a destroyed buffer if it only
        // keeps a view of its input.
        const std::string source{ src };
        lang::Tokenizer tkz{ source };

        auto tok = tkz.peek();
        REQUIRE_FALSE(tok.has_value());

        const auto &err = tok.error();
        REQUIRE(
            err.message.find("Invalid literal") != std::string::npos
        );
    }
}
|
||||
178
tests/Tokenizer/src/Operators.cpp
Normal file
178
tests/Tokenizer/src/Operators.cpp
Normal file
@ -0,0 +1,178 @@
|
||||
#include <catch2/catch_test_macros.hpp>
|
||||
|
||||
#include <array>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include <artichoke/Tokenizer/Tokenizer.hpp>
|
||||
#include <Utils.hpp>
|
||||
|
||||
namespace lang = arti::lang;
|
||||
|
||||
template <std::size_t N>
|
||||
static void CommonOpsSuccess(
|
||||
const std::array<std::string_view, N> &lexemes,
|
||||
const std::array<lang::TokenV, N> &kinds
|
||||
) {
|
||||
static_assert(N > 0, "Must provide at least one operator");
|
||||
const std::string source = SourceFromTokens(lexemes);
|
||||
|
||||
std::size_t it = 0;
|
||||
lang::Tokenizer tkz{ source };
|
||||
|
||||
for (auto token : tkz.range()) {
|
||||
REQUIRE(token.has_value());
|
||||
REQUIRE(token->value == kinds.at(it));
|
||||
REQUIRE(token->strValue == lexemes.at(it));
|
||||
++it;
|
||||
}
|
||||
|
||||
REQUIRE(it == lexemes.size());
|
||||
REQUIRE(tkz.peek().has_value());
|
||||
REQUIRE(tkz.peek()->value == lang::TokenV::tkEOF);
|
||||
}
|
||||
|
||||
// Every one-character operator / punctuator, each paired by position with the
// token kind it must produce.
TEST_CASE("Operators_SingleChar", "[operators][single]") {
    constexpr std::size_t kCount = 25;

    constexpr std::array<std::string_view, kCount> lexemes{
        ".", "%", "+", "-",
        "/", "!", "*", ":",
        ",", "=", ";", "^",
        "~", "&", "|", "<",
        ">", "(", ")", "[",
        "]", "{", "}", "?",
        "$"
    };

    // Same row layout as `lexemes` above so pairs line up visually.
    constexpr std::array<lang::TokenV, kCount> tokenKinds{
        lang::TokenV::opDot, lang::TokenV::opMod, lang::TokenV::opPlus, lang::TokenV::opHyphen,
        lang::TokenV::opSlash, lang::TokenV::opBang, lang::TokenV::opStar, lang::TokenV::opColon,
        lang::TokenV::opComma, lang::TokenV::opAssign, lang::TokenV::opSemicolon, lang::TokenV::opCaret,
        lang::TokenV::opTilde, lang::TokenV::opAnd, lang::TokenV::opOr, lang::TokenV::opLt,
        lang::TokenV::opGt, lang::TokenV::opLParen, lang::TokenV::opRParen, lang::TokenV::opLBracket,
        lang::TokenV::opRBracket, lang::TokenV::opLSquirly, lang::TokenV::opRSquirly, lang::TokenV::opOpt,
        lang::TokenV::opMut
    };

    CommonOpsSuccess(lexemes, tokenKinds);
}
|
||||
|
||||
// Two- and three-character operators: comparisons, shifts, boolean operators,
// compound assignments, and the '->', '::' and ':=' punctuators. Lexemes and
// kinds are matched by position.
TEST_CASE("Operators_MultiChar", "[operators][multi]") {
    constexpr std::size_t kCount = 22;

    constexpr std::array<std::string_view, kCount> lexemes{
        // comparison / shift / boolean
        "==", "!=", "<=", ">=",
        "<<", ">>", "&&", "||",
        // compound assignment
        "+=", "-=", "*=", "/=",
        "%=", "&=", "|=",
        "<<=", ">>=", "&&=", "||=",
        // arrow, scope access, label
        "->", "::", ":="
    };

    constexpr std::array<lang::TokenV, kCount> tokenKinds{
        lang::TokenV::opEq, lang::TokenV::opNeq, lang::TokenV::opLtEq, lang::TokenV::opGtEq,
        lang::TokenV::opLShift, lang::TokenV::opRShift, lang::TokenV::opBoolAnd, lang::TokenV::opBoolOr,
        lang::TokenV::opPlusAssign, lang::TokenV::opHyphenAssign, lang::TokenV::opStarAssign, lang::TokenV::opSlashAssign,
        lang::TokenV::opModAssign, lang::TokenV::opAndAssign, lang::TokenV::opOrAssign,
        lang::TokenV::opLShiftAssign, lang::TokenV::opRShiftAssign, lang::TokenV::opBoolAndAssign, lang::TokenV::opBoolORAssign,
        lang::TokenV::opArrow, lang::TokenV::opAccess, lang::TokenV::opLabel
    };

    CommonOpsSuccess(lexemes, tokenKinds);
}
|
||||
|
||||
// Operators that only exist with a leading dot: ".#", ".[", ".*" and ".@".
// Each must be produced as one token of the kind listed at the same index.
TEST_CASE("Operators_DotPrefixedSpecials", "[operators][dot][special]") {
    constexpr std::size_t kCount = 4;

    constexpr std::array<std::string_view, kCount> lexemes{
        ".#",
        ".[",
        ".*",
        ".@"
    };

    constexpr std::array<lang::TokenV, kCount> tokenKinds{
        lang::TokenV::opSliceSize,
        lang::TokenV::opPtrSlice,
        lang::TokenV::opSlicePtr,
        lang::TokenV::opReflect
    };

    CommonOpsSuccess(lexemes, tokenKinds);
}
|
||||
|
||||
// Longest-match check: when a shorter operator is a prefix of a longer one
// (">" / ">>" / ">>=", "-" / "->", ...), the tokenizer has to emit the
// longest valid operator as a single token.
TEST_CASE("Operators_GreedyLongestMatch", "[operators][greedy]") {
    constexpr std::size_t kCount = 8;

    constexpr std::array<std::string_view, kCount> lexemes{
        // three characters
        ">>=", "<<=", "&&=", "||=",
        // two characters
        ">=", "<=", "->", "::"
    };

    constexpr std::array<lang::TokenV, kCount> tokenKinds{
        lang::TokenV::opRShiftAssign, lang::TokenV::opLShiftAssign,
        lang::TokenV::opBoolAndAssign, lang::TokenV::opBoolORAssign,
        lang::TokenV::opGtEq, lang::TokenV::opLtEq,
        lang::TokenV::opArrow, lang::TokenV::opAccess
    };

    CommonOpsSuccess(lexemes, tokenKinds);
}
|
||||
|
||||
// Whitespace ends an operator: two '=' characters separated by a newline must
// be read as two opAssign tokens, never merged into a single '=='.
TEST_CASE("Operators_BoundariesWhitespace", "[operators][boundaries]") {
    const std::string source{ "=\n=" };
    lang::Tokenizer tkz{ source };

    // '=' before the newline
    auto firstAssign = tkz.peek();
    REQUIRE(firstAssign.has_value());
    REQUIRE(firstAssign->value == lang::TokenV::opAssign);
    REQUIRE(tkz.consume().has_value());

    // '=' after the newline
    auto secondAssign = tkz.peek();
    REQUIRE(secondAssign.has_value());
    REQUIRE(secondAssign->value == lang::TokenV::opAssign);
    REQUIRE(tkz.consume().has_value());

    auto endMarker = tkz.peek();
    REQUIRE(endMarker.has_value());
    REQUIRE(endMarker->value == lang::TokenV::tkEOF);
}
|
||||
|
||||
// Operators mixed with identifiers, with and without surrounding spaces:
// "ns::name := src->field" has to split into seven tokens with the exact
// kinds and texts listed in `sequence`, followed by EOF.
TEST_CASE("Operators_ContextAccessLabelArrow", "[operators][context]") {
    // One expected token: its kind and the exact text it must carry.
    struct ExpectedToken {
        lang::TokenV kind;
        std::string_view text;
    };

    const std::string source = "ns::name := src->field";
    lang::Tokenizer tkz{ source };

    const std::array<ExpectedToken, 7> sequence{{
        { lang::TokenV::tkIdentifier, "ns" },
        { lang::TokenV::opAccess, "::" },
        { lang::TokenV::tkIdentifier, "name" },
        { lang::TokenV::opLabel, ":=" },
        { lang::TokenV::tkIdentifier, "src" },
        { lang::TokenV::opArrow, "->" },
        { lang::TokenV::tkIdentifier, "field" }
    }};

    // Same peek / check / consume cycle for every expected token, in order.
    for (const ExpectedToken &step : sequence) {
        auto token = tkz.peek();
        REQUIRE(token.has_value());
        REQUIRE(token->value == step.kind);
        REQUIRE(token->strValue == step.text);
        REQUIRE(tkz.consume().has_value());
    }

    auto endMarker = tkz.peek();
    REQUIRE(endMarker.has_value());
    REQUIRE(endMarker->value == lang::TokenV::tkEOF);
}
|
||||
|
||||
// Characters that are not tokens on their own: '#' and '@' are only valid as
// part of ".#" / ".@", and the backtick is not an operator at all. For each
// input the first peek() must fail with a message containing "Invalid".
TEST_CASE("Operators_InvalidStandalone_Error", "[operators][error]") {
    const std::vector<std::string> rejected{ "#", "@", "`" };

    for (const std::string &text : rejected) {
        lang::Tokenizer tkz{ text };

        auto result = tkz.peek();
        REQUIRE_FALSE(result.has_value());

        const auto &failure = result.error();
        REQUIRE(failure.message.find("Invalid") != std::string::npos);
    }
}
|
||||
168
tests/Tokenizer/src/Strings.cpp
Normal file
168
tests/Tokenizer/src/Strings.cpp
Normal file
@ -0,0 +1,168 @@
|
||||
#include <catch2/catch_test_macros.hpp>
|
||||
|
||||
#include <array>
|
||||
#include <string>
|
||||
|
||||
#include <artichoke/Tokenizer/Tokenizer.hpp>
|
||||
#include <artichoke/Util/Expected.hpp>
|
||||
#include <Utils.hpp>
|
||||
|
||||
namespace lang = arti::lang;
|
||||
|
||||
template <std::size_t N>
|
||||
static void CommonLiteralsSuccess(
|
||||
lang::TokenV kind,
|
||||
const std::array<std::string_view, N> &lexemes
|
||||
) {
|
||||
const std::string source = SourceFromTokens(lexemes);
|
||||
|
||||
std::size_t it = 0;
|
||||
lang::Tokenizer tkz{ source };
|
||||
|
||||
for (auto token : tkz.range()) {
|
||||
REQUIRE(token.has_value());
|
||||
REQUIRE(token->value == kind);
|
||||
REQUIRE(token->strValue == lexemes.at(it++));
|
||||
}
|
||||
|
||||
REQUIRE(it == lexemes.size());
|
||||
REQUIRE(tkz.peek().has_value());
|
||||
REQUIRE(tkz.peek()->value == lang::TokenV::tkEOF);
|
||||
}
|
||||
|
||||
// Double-quoted strings without escapes: a single letter, a word, embedded
// spaces, digits only, and punctuation that would otherwise be operators.
// The expected token text includes the surrounding quotes.
TEST_CASE("Strings_Simple", "[strings][valid]") {
    using Literals = std::array<std::string_view, 5>;

    constexpr Literals stringLiterals{
        R"("a")",
        R"("hello")",
        R"("with spaces")",
        R"("12345")",
        // Custom raw-string delimiter because the payload itself contains )"
        R"Q("!@#$%^&*()")Q"
    };

    CommonLiteralsSuccess(lang::TokenV::tkString, stringLiterals);
}
|
||||
|
||||
// Strings containing backslash escapes. The inputs are raw string literals,
// so the tokenizer sees the backslashes verbatim; the expected token text is
// the unprocessed lexeme, escapes included.
TEST_CASE("Strings_Escapes", "[strings][valid][escapes]") {
    using Literals = std::array<std::string_view, 5>;

    constexpr Literals escapedLiterals{
        R"("quote: \"")",
        R"("backslash: \\")",
        R"("newline: \n")",
        R"("tab: \t")",
        R"("mix: \"\\\n\t")"
    };

    CommonLiteralsSuccess(lang::TokenV::tkString, escapedLiterals);
}
|
||||
|
||||
// No whitespace is needed between a string and an operator: "foo"+"bar" must
// yield tkString, opPlus, tkString and then EOF.
TEST_CASE("Strings_OperatorsAdjacency", "[strings][operators]") {
    const std::string source{ R"("foo"+"bar")" };
    lang::Tokenizer tkz{ source };

    // left operand
    auto lhs = tkz.peek();
    REQUIRE(lhs.has_value());
    REQUIRE(lhs->value == lang::TokenV::tkString);
    REQUIRE(lhs->strValue == R"("foo")");
    REQUIRE(tkz.consume().has_value());

    // the '+' squeezed between the two strings
    auto plus = tkz.peek();
    REQUIRE(plus.has_value());
    REQUIRE(plus->value == lang::TokenV::opPlus);
    REQUIRE(tkz.consume().has_value());

    // right operand
    auto rhs = tkz.peek();
    REQUIRE(rhs.has_value());
    REQUIRE(rhs->value == lang::TokenV::tkString);
    REQUIRE(rhs->strValue == R"("bar")");
    REQUIRE(tkz.consume().has_value());

    auto endMarker = tkz.peek();
    REQUIRE(endMarker.has_value());
    REQUIRE(endMarker->value == lang::TokenV::tkEOF);
}
|
||||
|
||||
// A string that is opened but never closed must produce an error whose
// message contains "Invalid literal".
TEST_CASE("Strings_Unterminated_Error", "[strings][error]") {
    const std::string source{ "\"unterminated" };
    lang::Tokenizer tkz{ source };

    auto result = tkz.peek();
    REQUIRE_FALSE(result.has_value());

    const auto &failure = result.error();
    REQUIRE(failure.message.find("Invalid literal") != std::string::npos);
}
|
||||
|
||||
// Single-quoted character literals holding one plain character: lower-case,
// upper-case, digit, underscore and '$'. The token text keeps the quotes.
TEST_CASE("Chars_Simple", "[chars][valid]") {
    using Literals = std::array<std::string_view, 5>;

    constexpr Literals charLiterals{
        R"('a')",
        R"('Z')",
        R"('0')",
        R"('_')",
        R"('$')"
    };

    CommonLiteralsSuccess(lang::TokenV::tkCharacter, charLiterals);
}
|
||||
|
||||
// Character literals whose single character is written as a backslash
// escape: newline, tab, backslash and single quote. Raw string literals keep
// the backslashes intact in the input and in the expected token text.
TEST_CASE("Chars_Escapes", "[chars][valid][escapes]") {
    using Literals = std::array<std::string_view, 4>;

    constexpr Literals escapedChars{
        R"('\n')",
        R"('\t')",
        R"('\\')",
        R"('\'')"
    };

    CommonLiteralsSuccess(lang::TokenV::tkCharacter, escapedChars);
}
|
||||
|
||||
// An empty character literal ('') is rejected with an "Invalid literal"
// error.
TEST_CASE("Chars_Invalid_Empty", "[chars][error]") {
    const std::string source{ "''" };
    lang::Tokenizer tkz{ source };

    auto result = tkz.peek();
    REQUIRE_FALSE(result.has_value());

    const auto &failure = result.error();
    REQUIRE(failure.message.find("Invalid literal") != std::string::npos);
}
|
||||
|
||||
// A character literal holding more than one character ('ab') is rejected
// with an "Invalid literal" error.
TEST_CASE("Chars_Invalid_Multiple", "[chars][error]") {
    const std::string source{ "'ab'" };
    lang::Tokenizer tkz{ source };

    auto result = tkz.peek();
    REQUIRE_FALSE(result.has_value());

    const auto &failure = result.error();
    REQUIRE(failure.message.find("Invalid literal") != std::string::npos);
}
|
||||
|
||||
// A character literal with no closing quote ('a) is rejected with an
// "Invalid literal" error.
TEST_CASE("Chars_Unterminated", "[chars][error]") {
    const std::string source{ "'a" };
    lang::Tokenizer tkz{ source };

    auto result = tkz.peek();
    REQUIRE_FALSE(result.has_value());

    const auto &failure = result.error();
    REQUIRE(failure.message.find("Invalid literal") != std::string::npos);
}
|
||||
|
||||
// An unknown escape inside a character literal ('\x') is expected to be
// rejected with an "Invalid literal" error.
// NOTE: the leading '.' in the "[.escapes]" tag makes this a hidden Catch2
// test, so it only runs when selected explicitly. Presumably this is
// intentional while escape validation is unfinished - confirm.
TEST_CASE("Chars_InvalidEscape", "[chars][error][.escapes]") {
    // The C++ escape "\\" puts a single backslash into the input: '\x'
    const std::string source{ "'\\x'" };
    lang::Tokenizer tkz{ source };

    auto result = tkz.peek();
    REQUIRE_FALSE(result.has_value());

    const auto &failure = result.error();
    REQUIRE(failure.message.find("Invalid literal") != std::string::npos);
}
|
||||
18
tests/cmake/dependencies.cmake
Normal file
18
tests/cmake/dependencies.cmake
Normal file
@ -0,0 +1,18 @@
|
||||
# Dependencies of the test suite: bootstrap CPM.cmake, then use it to fetch
# Catch2.
include(FetchContent)

# Get CPM
# The pinned release is downloaded into the build tree; EXPECTED_HASH makes
# CMake verify the SHA256 of the downloaded file.
file(DOWNLOAD
    https://github.com/cpm-cmake/CPM.cmake/releases/download/v0.40.8/CPM.cmake
    ${CMAKE_CURRENT_BINARY_DIR}/cmake/CPM.cmake
    EXPECTED_HASH SHA256=78ba32abdf798bc616bab7c73aac32a17bbd7b06ad9e26a6add69de8f3ae4791
)
include(${CMAKE_CURRENT_BINARY_DIR}/cmake/CPM.cmake)

# Get dependencies
CPMAddPackage("gh:catchorg/Catch2@3.6.0")

# Include Catch2 CMake scripts
# Putting Catch2's extras directory on the module path makes its helper
# scripts available to include().
list(APPEND CMAKE_MODULE_PATH ${Catch2_SOURCE_DIR}/extras)
|
||||
58
tests/include/Utils.hpp
Normal file
58
tests/include/Utils.hpp
Normal file
@ -0,0 +1,58 @@
|
||||
#pragma once
|
||||
|
||||
#include <random>
|
||||
|
||||
#include <artichoke/Coroutine/Generator.hpp>
|
||||
|
||||
template <std::ranges::range R1, std::ranges::range R2>
|
||||
requires(
|
||||
std::is_convertible_v<std::ranges::range_value_t<R1>, std::string_view> and
|
||||
std::is_same_v<std::ranges::range_value_t<R1>, std::ranges::range_value_t<R2>>
|
||||
)
|
||||
arti::lang::Generator<std::ranges::range_value_t<R1>>
|
||||
InterleaveRanges(R1 &&r1, R2 &&r2) {
|
||||
auto it1 = std::ranges::begin(r1);
|
||||
auto end1 = std::ranges::end(r1);
|
||||
|
||||
auto it2 = std::ranges::begin(r2);
|
||||
auto end2 = std::ranges::end(r2);
|
||||
|
||||
while (it1 != end1 && it2 != end2) {
|
||||
yield *it1;
|
||||
++it1;
|
||||
yield *it2;
|
||||
++it2;
|
||||
}
|
||||
}
|
||||
|
||||
// Endless generator of random whitespace runs.
//
// Every yielded value is between 1 and `maxLen` characters long and consists
// only of ' ', '\t' and '\n'. A `maxLen` of 0 is treated as 1, because
// std::uniform_int_distribution(a, b) requires a <= b.
//
// The yielded std::string_view points into a buffer owned by the coroutine
// that is overwritten when the generator is advanced; copy the characters
// before requesting the next value.
//
// The engine is seeded from std::random_device, so the produced whitespace
// differs from run to run.
// NOTE(review): `yield` is not a C++ keyword - presumably a project macro for
// co_yield provided through Generator.hpp; confirm.
static arti::lang::Generator<std::string_view>
WhitespaceGenerator(uint32_t maxLen = 5) {
    constexpr std::array<char, 3> spaceChars{ ' ', '\t', '\n' };

    // Guard the distribution's precondition (lower bound <= upper bound).
    const uint32_t longestRun = (maxLen == 0) ? 1 : maxLen;

    std::string str;
    std::random_device device;
    std::mt19937 engine(device());
    std::uniform_int_distribution<uint32_t> dist(1, longestRun);
    // Upper bound follows the table size instead of a hard-coded index.
    std::uniform_int_distribution<uint32_t> distChars(
        0, static_cast<uint32_t>(spaceChars.size() - 1)
    );

    str.reserve(longestRun);

    while (true) {
        str.resize(0);

        const auto sz = dist(engine);

        for (uint32_t i = 0; i < sz; ++i) {
            str += spaceChars[distChars(engine)];
        }

        yield str;
    }
}
|
||||
|
||||
template <std::ranges::range R>
|
||||
requires(std::is_same_v<std::ranges::range_value_t<R>, std::string_view>)
|
||||
static std::string SourceFromTokens(R &&tokens) {
|
||||
return InterleaveRanges(tokens, WhitespaceGenerator(10)) | std::views::join |
|
||||
std::ranges::to<std::string>();
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user