feat(test, tokenizer): Add test suite, fix issues it caught in Tokenizer, and add range-based API

Signed-off-by: erick-alcachofa <erick@artichoke.dev> This commit introduces a comprehensive test suite for the tokenizer using the Catch2 framework. To support this and improve the project structure, the build system and the tokenizer's API have been significantly updated. - Removed `cmake/testing.cmake` as it's no longer needed. - A new `TokenizerRange` class provides a C++20-style range interface, allowing for simple `for-each` loop iteration over tokens. This is used extensively in the new tests. - The CMake build system has been refactored: - An `ENABLE_TESTING` option (OFF by default) now controls whether the test suite is built. - The core library is now compiled into an object library, which is then used to produce both a shared (`.so`/`.dll`) and a static (`.a`/`.lib`) library. This improves build efficiency and provides more flexible linkage options. - The frontend executable now links against the static version of the library. - Implemented tests for the tokenizer using the Catch2 framework, covering various cases like identifiers, keywords, numbers, etc., which already caught some issues in the current implementation. - Several parsing bugs and edge cases in the tokenizer were fixed, including the handling of unterminated strings and invalid numeric literals. The README has been updated with instructions for building and running tests.
2025-10-03 12:54:41 -06:00 · 2025-10-03 12:54:41 -06:00 · e1b9e054f3
commit e1b9e054f3
parent 0f8688d3ee
20 changed files with 1323 additions and 40 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -15,19 +15,14 @@ set(PROJECT_URL "lang.artichoke.dev")
 set(PROJECT_AUTHOR "erick-alcachofa")
 set(PROJECT_AUTHOR_GITHUB "@erick-alcachofa")

-include(cmake/testing.cmake)
+option(ENABLE_TESTING "Enable build of tests for library" OFF)

 add_subdirectory(lib)
 add_subdirectory(frontend)

 install(
-  TARGETS frontend library
-  EXPORT  artichokeTargets
-  FILE_SET HEADERS
-  LIBRARY  DESTINATION lib
-  ARCHIVE  DESTINATION lib
+  TARGETS frontend
  RUNTIME  DESTINATION bin
-  INCLUDES DESTINATION include
 )

 get_target_property(
@ -43,6 +38,16 @@ install(
  )"
 )

+# Install both library flavours (the shared `library` and the static
+# `library_static` target) together with their HEADERS file set, and record
+# them in the `artichokeTargets` export set that is installed further below.
+install(
+  TARGETS library library_static
+  EXPORT  artichokeTargets
+  FILE_SET HEADERS
+  LIBRARY  DESTINATION lib
+  ARCHIVE  DESTINATION lib
+  RUNTIME  DESTINATION bin
+  INCLUDES DESTINATION include
+)
+
 install(
  EXPORT artichokeTargets
  FILE artichokeTargets.cmake
@ -62,3 +67,7 @@ install(
        "${CMAKE_CURRENT_BINARY_DIR}/artichokeConfigVersion.cmake"
  DESTINATION lib/cmake/artichoke
 )
+
+# Tests are opt-in: the tests/ directory is only configured when the
+# ENABLE_TESTING option is switched on.
+if(ENABLE_TESTING)
+  add_subdirectory(tests)
+endif()
--- a/README.md
+++ b/README.md
@ -46,6 +46,7 @@ grammar is stable, and the next step is the implementation of a compiler
 git clone https://git.artichoke.dev/me/artichoke-lang.git

 # Configure cmake
+# Optionally add -DENABLE_TESTING=ON for building tests
 cmake -DCMAKE_BUILD_TYPE=Release -S . -B build

 # Build the project
@ -54,6 +55,9 @@ cmake --build build
 # Run the binary
 ./build/frontend/artichoke-c

+# Run the tests if enabled
+ctest --test-dir build/tests --output-on-failure
+
 # Install if wanted
 cmake --install build --prefix=/usr/local

--- a/cmake/testing.cmake
+++ b/cmake/testing.cmake
--- a/frontend/CMakeLists.txt
+++ b/frontend/CMakeLists.txt
@ -34,5 +34,5 @@ target_include_directories(

 target_link_libraries(
  frontend PUBLIC
-    library
+    artichoke::library_static
 )
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@ -5,17 +5,17 @@ file(GLOB_RECURSE ARTI_LIB_HEADERS "include/**.hpp")
 file(GLOB_RECURSE ARTI_LIB_GEN_HEADERS "${CMAKE_CURRENT_BINARY_DIR}/include/**.hpp")

 add_library(
-  library SHARED
+  objs OBJECT
    ${ARTI_LIB_SOURCES}
 )

 set_target_properties(
-  library PROPERTIES
-  OUTPUT_NAME "artichoke"
+  objs PROPERTIES
+  POSITION_INDEPENDENT_CODE 1
 )

 target_compile_options(
-  library PRIVATE
+  objs PRIVATE
    -pedantic
    -Wall
    -Wextra
@ -30,24 +30,58 @@ target_compile_options(
    -Wno-unused
 )

-target_sources(
-  library PUBLIC
-  FILE_SET HEADERS
-  BASE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/include
-  FILES
-    ${ARTI_LIB_HEADERS}
-)
-
-target_sources(
-  library PUBLIC
-  FILE_SET HEADERS
-  BASE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/include
-  FILES
-    ${ARTI_LIB_GEN_HEADERS}
-)
-
 target_include_directories(
-  library PUBLIC
+  objs PUBLIC
    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
    $<INSTALL_INTERFACE:>
 )
+
+# Shared flavour of the library, assembled from the object files that the
+# `objs` OBJECT library compiles.
+add_library(
+  library SHARED
+    $<TARGET_OBJECTS:objs>
+)
+
+# Namespaced alias so in-tree consumers can refer to artichoke::library.
+add_library(
+  artichoke::library ALIAS
+    library
+)
+
+set_target_properties(
+  library PROPERTIES
+  OUTPUT_NAME "artichoke"
+)
+
+# Static flavour, built from the same object files as the shared one.
+add_library(
+  library_static STATIC
+    $<TARGET_OBJECTS:objs>
+)
+
+# Namespaced alias so in-tree consumers can refer to artichoke::library_static.
+add_library(
+  artichoke::library_static ALIAS
+    library_static
+)
+
+# NOTE(review): both flavours use OUTPUT_NAME "artichoke". On toolchains where
+# a shared library's import library has the same file name as a static
+# library (e.g. `artichoke.lib`), the two outputs may collide - confirm.
+set_target_properties(
+  library_static PROPERTIES
+  OUTPUT_NAME "artichoke"
+)
+
+# Every target that has to expose the public headers.
+set(LIB_TARGETS objs library library_static)
+
+# Attach the same HEADERS file set to each target in LIB_TARGETS: first the
+# hand-written headers from the source tree, then the generated headers
+# globbed from the build tree. `library` and `library_static` only reuse the
+# object files of `objs` via $<TARGET_OBJECTS:objs>, so the file set is
+# declared on each of them explicitly rather than relying on `objs`.
+foreach(TGET IN LISTS LIB_TARGETS)
+  target_sources(
+    ${TGET} INTERFACE
+    FILE_SET HEADERS
+    BASE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/include
+    FILES
+      ${ARTI_LIB_HEADERS}
+  )
+
+  target_sources(
+    ${TGET} INTERFACE
+    FILE_SET HEADERS
+    BASE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/include
+    FILES
+      ${ARTI_LIB_GEN_HEADERS}
+  )
+endforeach()
--- a/lib/include/artichoke/Tokenizer/Tokenizer.hpp
+++ b/lib/include/artichoke/Tokenizer/Tokenizer.hpp
@ -1,12 +1,12 @@
 #pragma once

 #include <deque>
-#include <vector>

 #include <artichoke/Util/Expected.hpp>
 #include <artichoke/Coroutine/Generator.hpp>

 #include <artichoke/Tokenizer/Token.hpp>
+#include <artichoke/Tokenizer/TokenizerRange.hpp>

 namespace arti::lang {

@ -30,6 +30,8 @@ namespace arti::lang {

    void swap(Tokenizer &other) noexcept;

+    TokenizerRange range() noexcept;
+
   private:
    Generator<Expected<Token>> tokenize();

--- a/lib/include/artichoke/Tokenizer/TokenizerRange.hpp
+++ b/lib/include/artichoke/Tokenizer/TokenizerRange.hpp
@ -0,0 +1,71 @@
+#pragma once
+
+#include <cstddef>
+#include <iterator>
+
+#include <artichoke/Util/Expected.hpp>
+#include <artichoke/Tokenizer/Token.hpp>
+
+namespace arti::lang {
+  struct Tokenizer;
+
+  /*
+   * Move-only range over the tokens of a Tokenizer, intended for range-based
+   * `for` loops. The range only stores a pointer to the tokenizer it was
+   * created from; construction from a tokenizer is private and reserved to
+   * the befriended Tokenizer.
+   */
+  struct [[nodiscard]] TokenizerRange {
+    friend struct Tokenizer;
+    struct Iterator;
+    struct Sentinel;
+
+    using iterator_type = Iterator;
+    using sentinel_type = Sentinel;
+
+    TokenizerRange(TokenizerRange &&) noexcept;
+    TokenizerRange &operator=(TokenizerRange &&) noexcept;
+
+    TokenizerRange(const TokenizerRange &) noexcept = delete;
+    TokenizerRange &operator=(const TokenizerRange &) noexcept = delete;
+
+    /* Creates an iterator positioned on the tokenizer's current token. */
+    Iterator begin();
+    /* Returns the (stateless) end marker. */
+    Sentinel end() const noexcept;
+
+    /*
+     * Move-only input iterator. It caches the result of the tokenizer's last
+     * peek in `cvalue`, which is what dereferencing hands out.
+     */
+    struct Iterator {
+      friend struct TokenizerRange;
+      using iterator_category = std::input_iterator_tag;
+      using difference_type = std::ptrdiff_t;
+
+      using ValueType = Expected<Token>;
+      using ReferenceType = ValueType &;
+      using PointerType = ValueType *;
+
+      /* Member names looked up by std::iterator_traits. */
+      using value_type = ValueType;
+      using reference = ReferenceType;
+      using pointer = PointerType;
+      /* Non-standard spelling, kept so existing users keep compiling. */
+      using pointer_type = PointerType;
+
+      Iterator(Iterator &&) noexcept;
+      Iterator &operator=(Iterator &&) noexcept;
+
+      Iterator(const Iterator &) noexcept = delete;
+      Iterator &operator=(const Iterator &) noexcept = delete;
+
+      /* Advances to the next token and returns this iterator. */
+      Iterator &operator++();
+
+      /* Post-increment returns nothing: the iterator cannot be copied. */
+      void operator++(int);
+
+      /* Both accessors expose the cached value (mutable, hence const). */
+      ReferenceType operator*() const noexcept;
+      PointerType operator->() const noexcept;
+
+      friend bool operator==(const Iterator &, Sentinel);
+      friend bool operator==(Sentinel, const Iterator &);
+
+      friend bool operator!=(const Iterator &, Sentinel);
+      friend bool operator!=(Sentinel, const Iterator &);
+
+     private:
+      Iterator(Tokenizer *tokenizer) noexcept;
+
+      Tokenizer *tokenizer;
+      mutable Expected<Token> cvalue;
+    };
+
+    /* Empty end-of-range marker compared against Iterator. */
+    struct Sentinel {};
+
+   private:
+    TokenizerRange(Tokenizer *tokenizer);
+
+    Tokenizer *tokenizer;
+  };
+}
--- a/lib/src/Tokenizer/Tokenizer.cpp
+++ b/lib/src/Tokenizer/Tokenizer.cpp
@ -1,8 +1,8 @@
 #include <artichoke/Tokenizer/Tokenizer.hpp>

-#include <print>
 #include <utility>

+#include <artichoke/Tokenizer/TokenizerRange.hpp>
 #include <artichoke/Util/Strings.hpp>
 #include <artichoke/Util/Demangle.hpp>
 #include <artichoke/Util/TrieMap.hpp>
@ -36,8 +36,22 @@ namespace arti::lang {
    return *this;
  }

+  /* Returns a TokenizerRange bound to this tokenizer (it stores `this`). */
+  TokenizerRange Tokenizer::range() noexcept {
+    return TokenizerRange{ this };
+  }
+
  Expected<void> Tokenizer::consume(std::size_t n) noexcept {
-    while (not tokensBuffer.empty()) {
+    while (n > 0) {
+      if (tokensBuffer.empty()) {
+        if (auto ok = peek(); ! ok) {
+          return Unexpected<>{ ok.error() };
+        }
+
+        if (finished()) {
+          break;
+        }
+      }
+
      tokensBuffer.pop_front();
      n -= 1;
    }
@ -94,7 +108,11 @@ namespace arti::lang {

  bool Tokenizer::finished() const noexcept {
    if (tokensGenerator.finished()) {
-      return tokensBuffer.empty();
+      if (!tokensBuffer.empty()) {
+        return tokensBuffer.front().value == TokenV::tkEOF;
+      }
+
+      return true;
    }

    return false;
@ -340,6 +358,7 @@ namespace arti::lang {
    }

    if (*iter == '.') {
+      auto dotIter = iter;
      forward();

      while (iter != source.end()) {
@ -350,6 +369,21 @@ namespace arti::lang {
        forward();
      }

+      if ((iter - dotIter) == 1) {
+        /* No digit followed the dot. Capture what was found while `iter`
+         * still points at it: once reverted, `iter` is on the dot and
+         * looking one past it would read beyond the end for a trailing dot. */
+        std::string found = iter == source.end()
+          ? "EOF"
+          : std::string{ *iter };
+
+        /* Revert to dot */
+        --iter;
+        --column;
+
+        return langException<ExceptCode::ecInvalidLiteral>(
+          line,
+          column,
+          "digit",
+          std::move(found)
+        );
+      }
+
      return Token{
        TokenV::tkDecimal,
        cLine,
@ -390,7 +424,6 @@ namespace arti::lang {
      }

      if (*iter == '"') {
-        forward();
        break;
      }

@ -412,12 +445,23 @@ namespace arti::lang {
      forward();
    }

-    return Token{
-      TokenV::tkString,
-      cLine,
-      cColumn,
-      { stIter, iter }
-    };
+    /* Only look at the current character when there is one: an unterminated
+     * string leaves `iter` at the end of the source. */
+    if (iter != source.end() && *iter == '"') {
+      /* Step over the closing quote so it is part of the lexeme. */
+      forward();
+
+      return Token{
+        TokenV::tkString,
+        cLine,
+        cColumn,
+        { stIter, iter }
+      };
+    }
+
+    /* No closing quote was found. */
+    return langException<ExceptCode::ecInvalidLiteral>(
+      line,
+      column,
+      "end of string (\")",
+      "EOF"
+    );
  }

  Expected<Token> Tokenizer::readCharacter() {
--- a/lib/src/Tokenizer/TokenizerRange.cpp
+++ b/lib/src/Tokenizer/TokenizerRange.cpp
@ -0,0 +1,84 @@
+#include <artichoke/Tokenizer/TokenizerRange.hpp>
+
+#include <utility>
+
+#include <artichoke/Tokenizer/Tokenizer.hpp>
+
+namespace arti::lang {
+
+  using Iterator = TokenizerRange::Iterator;
+  using Sentinel = TokenizerRange::Sentinel;
+
+  /* Binds the range to the given tokenizer; only the pointer is stored. */
+  TokenizerRange::TokenizerRange(Tokenizer *tokenizer)
+    : tokenizer{ tokenizer } { }
+
+  /* Moving takes over the pointer and leaves `other` with nullptr. */
+  TokenizerRange::TokenizerRange(TokenizerRange &&other) noexcept
+    : tokenizer{ std::exchange(other.tokenizer, nullptr) } { }
+
+  TokenizerRange &TokenizerRange::operator=(TokenizerRange &&other) noexcept {
+    Tokenizer *taken = std::exchange(other.tokenizer, nullptr);
+    tokenizer = taken;
+    return *this;
+  }
+
+  /* The iterator peeks the tokenizer's current token on construction. */
+  Iterator TokenizerRange::begin() {
+    return Iterator{ tokenizer };
+  }
+
+  Sentinel TokenizerRange::end() const noexcept {
+    return {};
+  }
+
+  /* Caches the result of the first peek so dereferencing is immediate. */
+  Iterator::Iterator(Tokenizer *tokenizer) noexcept
+    : tokenizer(tokenizer)
+    , cvalue(tokenizer->peek()) { }
+
+  /* Moving transfers both the pointer and the cached value. */
+  Iterator::Iterator(Iterator &&other) noexcept
+    : tokenizer(std::exchange(other.tokenizer, nullptr))
+    , cvalue(std::exchange(other.cvalue, {})) { }
+
+  Iterator &Iterator::operator=(Iterator &&other) noexcept {
+    tokenizer = std::exchange(other.tokenizer, nullptr);
+    cvalue = std::exchange(other.cvalue, {});
+    return *this;
+  }
+
+  /* Consumes the current token only when the cached peek holds a value,
+   * then caches the result of the next peek. */
+  Iterator &Iterator::operator++() {
+    if (cvalue) {
+      std::ignore = tokenizer->consume();
+    }
+
+    cvalue = tokenizer->peek();
+    return *this;
+  }
+
+  void Iterator::operator++(int) {
+    std::ignore = ++(*this);
+  }
+
+  Iterator::ReferenceType Iterator::operator*() const noexcept {
+    return cvalue;
+  }
+
+  Iterator::PointerType Iterator::operator->() const noexcept {
+    return &cvalue;
+  }
+
+  /* The end is reached when the tokenizer reports it is finished, when the
+   * cached peek is an error, or when the cached token is EOF. */
+  bool operator==(const Iterator &it, Sentinel) {
+    return it.tokenizer->finished()
+      || !it.cvalue.has_value()
+      || it.cvalue->value == TokenV::tkEOF;
+  }
+
+  bool operator==(Sentinel end, const Iterator &it) {
+    return it == end;
+  }
+
+  bool operator!=(const Iterator &it, Sentinel end) {
+    return !(it == end);
+  }
+
+  bool operator!=(Sentinel end, const Iterator &it) {
+    return !(it == end);
+  }
+
+} // namespace arti::lang
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@ -0,0 +1,20 @@
+# Test-only dependencies. Presumably this makes Catch2 available, since the
+# Catch2::Catch2WithMain target and the Catch module are used below - confirm.
+include(cmake/dependencies.cmake)
+
+# Enable CTest test registration for this directory and the ones below it.
+enable_testing()
+
+# Interface target bundling what every test executable needs: the shared test
+# include directory, the library under test and Catch2 with its main().
+# NOTE(review): `test` is a target name CMake can reserve when testing is
+# enabled (policy CMP0037); it appears to be accepted because enable_testing()
+# is called here rather than in the top-level directory - confirm before
+# moving that call.
+add_library(test INTERFACE)
+
+target_include_directories(
+  test INTERFACE
+    ${CMAKE_CURRENT_SOURCE_DIR}/include
+)
+
+target_link_libraries(
+  test INTERFACE
+    artichoke::library
+    Catch2::Catch2WithMain
+)
+
+# Provides catch_discover_tests(), used by the per-component test directories.
+include(Catch)
+
+add_subdirectory(Tokenizer)
--- a/tests/Tokenizer/CMakeLists.txt
+++ b/tests/Tokenizer/CMakeLists.txt
@ -0,0 +1,16 @@
+# Collect every tokenizer test source below src/. CONFIGURE_DEPENDS makes the
+# build re-check the glob, so a newly added test file is picked up without a
+# manual re-run of CMake.
+file(
+  GLOB_RECURSE TOKENIZER_TEST_SRC
+  CONFIGURE_DEPENDS
+    "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp"
+)
+
+add_executable(
+  test-tokenizer
+    ${TOKENIZER_TEST_SRC}
+)
+
+# `test` is the interface target from the parent directory that carries the
+# library under test, Catch2 and the shared test include directory.
+target_link_libraries(
+  test-tokenizer PRIVATE
+    test
+)
+
+# Register each Catch2 test case with CTest under the "Tokenizer." prefix.
+catch_discover_tests(
+  test-tokenizer
+  TEST_PREFIX "Tokenizer."
+)
--- a/tests/Tokenizer/src/Api.cpp
+++ b/tests/Tokenizer/src/Api.cpp
@ -0,0 +1,99 @@
+#include <catch2/catch_test_macros.hpp>
+
+#include <string>
+#include <vector>
+
+#include <artichoke/Tokenizer/Tokenizer.hpp>
+
+namespace lang = arti::lang;
+
+TEST_CASE("API_PeekOffset", "[api][peek]") {
+  const std::string source = "a b c";
+  lang::Tokenizer tkz{ source };
+
+  auto t0 = tkz.peek(0);
+  REQUIRE(t0.has_value());
+  REQUIRE(t0->value == lang::TokenV::tkIdentifier);
+  REQUIRE(t0->strValue == "a");
+
+  auto t1 = tkz.peek(1);
+  REQUIRE(t1.has_value());
+  REQUIRE(t1->value == lang::TokenV::tkIdentifier);
+  REQUIRE(t1->strValue == "b");
+
+  auto t2 = tkz.peek(2);
+  REQUIRE(t2.has_value());
+  REQUIRE(t2->value == lang::TokenV::tkIdentifier);
+  REQUIRE(t2->strValue == "c");
+
+  auto t3 = tkz.peek(3);
+  REQUIRE(t3.has_value());
+  REQUIRE(t3->value == lang::TokenV::tkEOF);
+
+  // Ensure nothing was consumed by peeks
+  REQUIRE_FALSE(tkz.finished());
+  auto t0_again = tkz.peek();
+  REQUIRE(t0_again.has_value());
+  REQUIRE(t0_again->value == lang::TokenV::tkIdentifier);
+  REQUIRE(t0_again->strValue == "a");
+}
+
+TEST_CASE("API_ConsumeAndFinishedSemantics", "[api][consume][finished]") {
+  const std::string source = "x y z";
+  lang::Tokenizer tkz{ source };
+
+  // consume 2 tokens: x, y
+  REQUIRE(tkz.consume(2).has_value());
+  REQUIRE_FALSE(tkz.finished());
+
+  auto t = tkz.peek();
+  REQUIRE(t.has_value());
+  REQUIRE(t->value == lang::TokenV::tkIdentifier);
+  REQUIRE(t->strValue == "z");
+
+  // consume last token
+  REQUIRE(tkz.consume().has_value());
+  // finished() returns true only after EOF token has been produced
+  REQUIRE_FALSE(tkz.finished());
+
+  auto eof = tkz.peek();
+  REQUIRE(eof.has_value());
+  REQUIRE(eof->value == lang::TokenV::tkEOF);
+  REQUIRE(tkz.finished());
+}
+
+TEST_CASE("API_RangeIterationStopsAtEOF", "[api][range]") {
+  const std::string source =
+    "let ident := 42 /* skip this */ + 1";
+  lang::Tokenizer tkz{ source };
+
+  std::vector<lang::TokenV> kinds;
+  std::vector<std::string_view> lex;
+
+  for (auto token : tkz.range()) {
+    REQUIRE(token.has_value());
+    kinds.push_back(token->value);
+    lex.push_back(token->strValue);
+  }
+
+  // Expected sequence: kwLet, tkIdentifier("ident"), opLabel, tkInteger("42"), opPlus, tkInteger("1")
+  REQUIRE(kinds.size() == 6);
+  REQUIRE(kinds[0] == lang::TokenV::kwLet);
+  REQUIRE(kinds[1] == lang::TokenV::tkIdentifier);
+  REQUIRE(kinds[2] == lang::TokenV::opLabel);
+  REQUIRE(kinds[3] == lang::TokenV::tkInteger);
+  REQUIRE(kinds[4] == lang::TokenV::opPlus);
+  REQUIRE(kinds[5] == lang::TokenV::tkInteger);
+
+  REQUIRE(lex[0] == "let");
+  REQUIRE(lex[1] == "ident");
+  REQUIRE(lex[2] == ":=");
+  REQUIRE(lex[3] == "42");
+  REQUIRE(lex[4] == "+");
+  REQUIRE(lex[5] == "1");
+
+  // After iterating the range, peek should yield EOF
+  auto eof = tkz.peek();
+  REQUIRE(eof.has_value());
+  REQUIRE(eof->value == lang::TokenV::tkEOF);
+}
--- a/tests/Tokenizer/src/Comments.cpp
+++ b/tests/Tokenizer/src/Comments.cpp
@ -0,0 +1,87 @@
+#include <catch2/catch_test_macros.hpp>
+
+#include <string>
+
+#include <artichoke/Tokenizer/Tokenizer.hpp>
+#include <artichoke/Util/Expected.hpp>
+
+namespace lang = arti::lang;
+
+TEST_CASE("Comments_BlockSkipped", "[comments][block][skip]") {
+  // Ensure that block comments are ignored and do not emit tokens.
+  const std::string source = "foo /* a block comment with symbols 123 !@# */ bar";
+
+  lang::Tokenizer tkz{source};
+
+  auto t1 = tkz.peek();
+  REQUIRE(t1.has_value());
+  REQUIRE(t1->value == lang::TokenV::tkIdentifier);
+  REQUIRE(t1->strValue == "foo");
+
+  REQUIRE(tkz.consume().has_value());
+
+  auto t2 = tkz.peek();
+  REQUIRE(t2.has_value());
+  REQUIRE(t2->value == lang::TokenV::tkIdentifier);
+  REQUIRE(t2->strValue == "bar");
+
+  REQUIRE(tkz.consume().has_value());
+
+  auto eof = tkz.peek();
+  REQUIRE(eof.has_value());
+  REQUIRE(eof->value == lang::TokenV::tkEOF);
+}
+
+TEST_CASE("Comments_BlockMultiline", "[comments][block][multiline]") {
+  const std::string source =
+      "alpha /* line1\n"
+      "line2\n"
+      "line3 */ beta";
+
+  lang::Tokenizer tkz{source};
+
+  auto t1 = tkz.peek();
+  REQUIRE(t1.has_value());
+  REQUIRE(t1->value == lang::TokenV::tkIdentifier);
+  REQUIRE(t1->strValue == "alpha");
+  REQUIRE(tkz.consume().has_value());
+
+  auto t2 = tkz.peek();
+  REQUIRE(t2.has_value());
+  REQUIRE(t2->value == lang::TokenV::tkIdentifier);
+  REQUIRE(t2->strValue == "beta");
+  REQUIRE(tkz.consume().has_value());
+
+  auto eof = tkz.peek();
+  REQUIRE(eof.has_value());
+  REQUIRE(eof->value == lang::TokenV::tkEOF);
+}
+
+TEST_CASE("Comments_UnterminatedBlock_Error", "[comments][block][error]") {
+  // Unterminated block comments should surface an error from the tokenizer.
+  const std::string source = "foo /* this never ends...";
+
+  lang::Tokenizer tkz{source};
+
+  auto t1 = tkz.peek();
+  REQUIRE(t1.has_value());
+  REQUIRE(t1->value == lang::TokenV::tkIdentifier);
+  REQUIRE(t1->strValue == "foo");
+  REQUIRE(tkz.consume().has_value());
+
+  auto errTok = tkz.peek();
+  REQUIRE_FALSE(errTok.has_value());
+
+  // Check error message mentions invalid comment.
+  const auto &err = errTok.error();
+  REQUIRE(err.message.find("Invalid comment") != std::string::npos);
+}
+
+// Placeholder for single-line comment support: the body skips immediately, so
+// the statements after SKIP() are not expected to run.
+// NOTE(review): the leading '.' in the "[.line]" tag presumably also hides the
+// case from Catch2's default run - confirm against the Catch2 version in use.
+TEST_CASE("Comments_SingleLineUnsupported_Skip", "[comments][.line]") {
+  // The tokenizer currently does NOT support '//' comments.
+  // Keep this test as a placeholder and mark it skipped to avoid failures.
+  SKIP("Single-line '//' comments are not supported yet by the tokenizer");
+  const std::string source = "foo // comment\n bar";
+  lang::Tokenizer tkz{source};
+  (void)tkz; // silence unused
+}
--- a/tests/Tokenizer/src/Identifiers.cpp
+++ b/tests/Tokenizer/src/Identifiers.cpp
@ -0,0 +1,127 @@
+#include <catch2/catch_test_macros.hpp>
+
+#include <array>
+#include <string>
+#include <vector>
+
+#include <artichoke/Tokenizer/Tokenizer.hpp>
+#include <Utils.hpp>
+
+namespace lang = arti::lang;
+
+// Tokenizes a source built from `ids` and requires that every token produced
+// by the range is an identifier whose text matches the corresponding entry,
+// in order. Afterwards the number of tokens seen must equal ids.size() and
+// the tokenizer must be positioned on EOF.
+// SourceFromTokens comes from Utils.hpp; presumably it joins the lexemes with
+// separators - confirm there.
+template <std::size_t N>
+static void CommonIdentifiersSuccess(
+  const std::array<std::string_view, N> &ids
+) {
+  const std::string source = SourceFromTokens(ids);
+
+  // Index of the next expected identifier.
+  std::size_t it = 0;
+  lang::Tokenizer tkz{ source };
+
+  for (auto token : tkz.range()) {
+    REQUIRE(token.has_value());
+    REQUIRE(token->value == lang::TokenV::tkIdentifier);
+    REQUIRE(token->strValue == ids.at(it++));
+  }
+
+  // The range stops before EOF, so a shortfall here means iteration ended
+  // early.
+  REQUIRE(it == ids.size());
+  REQUIRE(tkz.peek().has_value());
+  REQUIRE(tkz.peek()->value == lang::TokenV::tkEOF);
+}
+
+TEST_CASE("Identifiers_Basic", "[identifiers][valid]") {
+  constexpr std::array<std::string_view, 8> ids = {
+    "a", "abc", "a_b", "snake_case", "camelCase", "PascalCase", "_id", "with123"
+  };
+
+  CommonIdentifiersSuccess(ids);
+}
+
+TEST_CASE("Identifiers_DigitsAfterFirst", "[identifiers][valid]") {
+  constexpr std::array<std::string_view, 6> ids = {
+    "a1", "abc123", "_a1_b2", "v2", "x9y8z7", "i18n"
+  };
+
+  CommonIdentifiersSuccess(ids);
+}
+
+TEST_CASE("Identifiers_Long", "[identifiers][valid][long]") {
+  // Create a long identifier to ensure tokenizer handles large spans.
+  std::string longId(512, 'a');
+  std::vector<std::string_view> toks{ longId };
+
+  const std::string source = SourceFromTokens(toks);
+  lang::Tokenizer tkz{ source };
+
+  auto t = tkz.peek();
+  REQUIRE(t.has_value());
+  REQUIRE(t->value == lang::TokenV::tkIdentifier);
+  REQUIRE(t->strValue == longId);
+
+  REQUIRE(tkz.consume().has_value());
+  auto eof = tkz.peek();
+  REQUIRE(eof.has_value());
+  REQUIRE(eof->value == lang::TokenV::tkEOF);
+}
+
+TEST_CASE("Identifiers_WithOperators", "[identifiers][operators]") {
+  // '$' and '?' are operators, not identifier characters.
+  // '$foo' -> opMut, tkIdentifier("foo")
+  // '?bar' -> opOpt, tkIdentifier("bar")
+  const std::string source = "$foo ?bar";
+
+  lang::Tokenizer tkz{ source };
+
+  auto t1 = tkz.peek();
+  REQUIRE(t1.has_value());
+  REQUIRE(t1->value == lang::TokenV::opMut);
+  REQUIRE(tkz.consume().has_value());
+
+  auto t2 = tkz.peek();
+  REQUIRE(t2.has_value());
+  REQUIRE(t2->value == lang::TokenV::tkIdentifier);
+  REQUIRE(t2->strValue == "foo");
+  REQUIRE(tkz.consume().has_value());
+
+  auto t3 = tkz.peek();
+  REQUIRE(t3.has_value());
+  REQUIRE(t3->value == lang::TokenV::opOpt);
+  REQUIRE(tkz.consume().has_value());
+
+  auto t4 = tkz.peek();
+  REQUIRE(t4.has_value());
+  REQUIRE(t4->value == lang::TokenV::tkIdentifier);
+  REQUIRE(t4->strValue == "bar");
+  REQUIRE(tkz.consume().has_value());
+
+  auto eof = tkz.peek();
+  REQUIRE(eof.has_value());
+  REQUIRE(eof->value == lang::TokenV::tkEOF);
+}
+
+TEST_CASE("Identifiers_DotAccess", "[identifiers][dot]") {
+  const std::string source = "foo.bar";
+
+  lang::Tokenizer tkz{ source };
+
+  auto t1 = tkz.peek();
+  REQUIRE(t1.has_value());
+  REQUIRE(t1->value == lang::TokenV::tkIdentifier);
+  REQUIRE(t1->strValue == "foo");
+  REQUIRE(tkz.consume().has_value());
+
+  auto t2 = tkz.peek();
+  REQUIRE(t2.has_value());
+  REQUIRE(t2->value == lang::TokenV::opDot);
+  REQUIRE(tkz.consume().has_value());
+
+  auto t3 = tkz.peek();
+  REQUIRE(t3.has_value());
+  REQUIRE(t3->value == lang::TokenV::tkIdentifier);
+  REQUIRE(t3->strValue == "bar");
+  REQUIRE(tkz.consume().has_value());
+
+  auto eof = tkz.peek();
+  REQUIRE(eof.has_value());
+  REQUIRE(eof->value == lang::TokenV::tkEOF);
+}
--- a/tests/Tokenizer/src/Keywords.cpp
+++ b/tests/Tokenizer/src/Keywords.cpp
@ -0,0 +1,93 @@
+#include <catch2/catch_test_macros.hpp>
+
+#include <array>
+
+#include <artichoke/Tokenizer/Tokenizer.hpp>
+
+#include <Utils.hpp>
+
+namespace lang = arti::lang;
+
+// Tokenizes a source built from `lexemes` and requires that the i-th token
+// has kind kinds[i] and text lexemes[i]. Afterwards the number of tokens seen
+// must equal lexemes.size() and the tokenizer must be positioned on EOF.
+// SourceFromTokens comes from Utils.hpp; presumably it joins the lexemes with
+// separators - confirm there.
+template <std::size_t N>
+static void CommonKeywordsSuccess(
+  const std::array<std::string_view, N> &lexemes,
+  const std::array<lang::TokenV, N> &kinds
+) {
+  static_assert(N > 0, "Must provide at least one keyword");
+  const std::string source = SourceFromTokens(lexemes);
+
+  // Index of the next expected lexeme/kind pair.
+  std::size_t it = 0;
+  lang::Tokenizer tkz{ source };
+
+  for (auto token : tkz.range()) {
+    REQUIRE(token.has_value());
+    REQUIRE(token->value == kinds.at(it));
+    REQUIRE(token->strValue == lexemes.at(it));
+    ++it;
+  }
+
+  // The range stops before EOF, so a shortfall here means iteration ended
+  // early.
+  REQUIRE(it == lexemes.size());
+  REQUIRE(tkz.peek().has_value());
+  REQUIRE(tkz.peek()->value == lang::TokenV::tkEOF);
+}
+
+TEST_CASE("Keywords_AllRecognized", "[keywords][valid]") {
+  constexpr std::array<std::string_view, 31> lexemes = {
+    "_","or","not","and","if","else","fn","enum","struct","def","let","for",
+    "loop","break","continue","while","match","switch","return","unreachable",
+    "defer","errdefer","true","false","null","this","import","export","module","using","this"
+  };
+
+  constexpr std::array<lang::TokenV, 31> kinds = {
+    lang::TokenV::kwUnderscore,
+    lang::TokenV::kwOr,
+    lang::TokenV::kwNot,
+    lang::TokenV::kwAnd,
+    lang::TokenV::kwIf,
+    lang::TokenV::kwElse,
+    lang::TokenV::kwFn,
+    lang::TokenV::kwEnum,
+    lang::TokenV::kwStruct,
+    lang::TokenV::kwDef,
+    lang::TokenV::kwLet,
+    lang::TokenV::kwFor,
+    lang::TokenV::kwLoop,
+    lang::TokenV::kwBreak,
+    lang::TokenV::kwContinue,
+    lang::TokenV::kwWhile,
+    lang::TokenV::kwMatch,
+    lang::TokenV::kwSwitch,
+    lang::TokenV::kwReturn,
+    lang::TokenV::kwUnreachable,
+    lang::TokenV::kwDefer,
+    lang::TokenV::kwErrDefer,
+    lang::TokenV::kwTrue,
+    lang::TokenV::kwFalse,
+    lang::TokenV::kwNull,
+    lang::TokenV::kwThis,
+    lang::TokenV::kwImport,
+    lang::TokenV::kwExport,
+    lang::TokenV::kwModule,
+    lang::TokenV::kwUsing,
+    lang::TokenV::kwThis,
+  };
+
+  CommonKeywordsSuccess(lexemes, kinds);
+}
+
+TEST_CASE("Keywords_PrecedenceOverIdentifiers", "[keywords][precedence]") {
+  // Ensure that keywords are recognized as keywords, while longer names remain identifiers.
+  constexpr std::array<std::string_view, 6> lexemes = {
+    "if", "iff", "return", "returnX", "_", "_id"
+  };
+  constexpr std::array<lang::TokenV, 6> kinds = {
+    lang::TokenV::kwIf,        // "if" is a keyword
+    lang::TokenV::tkIdentifier,// "iff" should be an identifier
+    lang::TokenV::kwReturn,    // "return" is a keyword
+    lang::TokenV::tkIdentifier,// "returnX" is not a keyword
+    lang::TokenV::kwUnderscore,// "_" is a keyword in this language
+    lang::TokenV::tkIdentifier // "_id" is a regular identifier
+  };
+
+  CommonKeywordsSuccess(lexemes, kinds);
+}
--- a/tests/Tokenizer/src/Numbers.cpp
+++ b/tests/Tokenizer/src/Numbers.cpp
@ -0,0 +1,171 @@
+#include <catch2/catch_test_macros.hpp>
+
+#include <array>
+
+#include <artichoke/Tokenizer/Tokenizer.hpp>
+
+#include <Utils.hpp>
+
+namespace lang = arti::lang;
+
+// Tokenizes a source built from `expected` and requires that every token has
+// kind `type` and text equal to the corresponding entry, in order. Despite
+// the name it is also called with tkDecimal for the decimal literals test.
+// Afterwards the number of tokens seen must equal expected.size() and the
+// tokenizer must be positioned on EOF.
+// SourceFromTokens comes from Utils.hpp; presumably it joins the lexemes with
+// separators - confirm there.
+template <std::size_t N>
+static void CommonIntegersSuccess(
+  lang::TokenV type,
+  const std::array<std::string_view, N> &expected
+) {
+  const std::string source = SourceFromTokens(expected);
+
+  // Index of the next expected literal.
+  std::size_t it = 0;
+  lang::Tokenizer tkz{ source };
+
+  for (auto token : tkz.range()) {
+    REQUIRE(token.has_value());
+    REQUIRE(token->value == type);
+    REQUIRE(token->strValue == expected.at(it++));
+  }
+
+  // The range stops before EOF, so a shortfall here means iteration ended
+  // early.
+  REQUIRE(it == expected.size());
+  REQUIRE(tkz.peek().has_value());
+  REQUIRE(tkz.peek()->value == lang::TokenV::tkEOF);
+}
+
+TEST_CASE("Numbers_Integers", "[integers][valid]") {
+  constexpr std::array<std::string_view, 6> expected = {
+    "0", "1", "42", "123456", "98712390", "12381723912465471"
+  };
+
+  CommonIntegersSuccess(lang::TokenV::tkInteger, expected);
+}
+
+TEST_CASE("Numbers_HexIntegers", "[integers][valid]") {
+  constexpr std::array<std::string_view, 6> expected = {
+    "0x831",       "0xAFEFE",    "0xABEBE",
+    "0x7a147e8a3", "0x98712390", "0x1d238c18e7ff239a12465471"
+  };
+
+  CommonIntegersSuccess(lang::TokenV::tkInteger, expected);
+}
+
+TEST_CASE("Numbers_OctIntegers", "[integers][valid]") {
+  constexpr std::array<std::string_view, 6> expected = {
+    "041", "064123", "0136237", "012345", "01", "071236571236512631723651"
+  };
+
+  CommonIntegersSuccess(lang::TokenV::tkInteger, expected);
+}
+
+TEST_CASE("Numbers_BinIntegers", "[integers][valid]") {
+  constexpr std::array<std::string_view, 6> expected = {
+    "0b0101101",          "0b1", "0b01", "0b0", "0b011010101110101101110101011",
+    "0b11110101011010101"
+  };
+
+  CommonIntegersSuccess(lang::TokenV::tkInteger, expected);
+}
+
+TEST_CASE("Numbers_Decimal", "[decimals][valid]") {
+  constexpr std::array<std::string_view, 6> expected = {
+    "1.0", "0.5", "3.14159", "10.50", "9999.0001", "1375123476175981.813751235"
+  };
+
+  CommonIntegersSuccess(lang::TokenV::tkDecimal, expected);
+}
+
+TEST_CASE("Numbers_UnaryMinusSeparate", "[numbers][unary-minus]") {
+  const std::string source = "-1 -2.5";
+  lang::Tokenizer tkz{ source };
+
+  auto t1 = tkz.peek();
+  REQUIRE(t1.has_value());
+  REQUIRE(t1->value == lang::TokenV::opHyphen);
+  REQUIRE(tkz.consume().has_value());
+
+  auto t2 = tkz.peek();
+  REQUIRE(t2.has_value());
+  REQUIRE(t2->value == lang::TokenV::tkInteger);
+  REQUIRE(t2->strValue == "1");
+  REQUIRE(tkz.consume().has_value());
+
+  auto t3 = tkz.peek();
+  REQUIRE(t3.has_value());
+  REQUIRE(t3->value == lang::TokenV::opHyphen);
+  REQUIRE(tkz.consume().has_value());
+
+  auto t4 = tkz.peek();
+  REQUIRE(t4.has_value());
+  REQUIRE(t4->value == lang::TokenV::tkDecimal);
+  REQUIRE(t4->strValue == "2.5");
+  REQUIRE(tkz.consume().has_value());
+
+  auto eof = tkz.peek();
+  REQUIRE(eof.has_value());
+  REQUIRE(eof->value == lang::TokenV::tkEOF);
+}
+
+TEST_CASE("Numbers_DotBoundaries_Disambiguation", "[numbers][dot][edge]") {
+  // Expect numbers must start with a digit:
+  // .5   -> '.' + '5'
+  // 10.  -> ERROR (expects digit after '.'), then '.' token if continued
+  // 1..2 -> ERROR (expects digit after '.'), then '.' '.' '2' if continued
+  const std::string source = ".5 10. 1..2";
+  lang::Tokenizer tkz{ source };
+
+  // .5 -> '.' then '5'
+  auto a1 = tkz.peek();
+  REQUIRE(a1.has_value());
+  REQUIRE(a1->value == lang::TokenV::opDot);
+  REQUIRE(tkz.consume().has_value());
+
+  auto a2 = tkz.peek();
+  REQUIRE(a2.has_value());
+  REQUIRE(a2->value == lang::TokenV::tkInteger);
+  REQUIRE(a2->strValue == "5");
+  REQUIRE(tkz.consume().has_value());
+
+  // 10. -> first an error (expects a digit after '.'), then '.' is parsed if we continue
+  auto err1 = tkz.peek();
+  REQUIRE_FALSE(err1.has_value());
+
+  auto after_err1 = tkz.peek();
+  REQUIRE(after_err1.has_value());
+  REQUIRE(after_err1->value == lang::TokenV::opDot);
+  REQUIRE(tkz.consume().has_value());
+
+  // 1..2 -> first an error (expects a digit after '.'), then '.' '.' '2'
+  auto err2 = tkz.peek();
+  REQUIRE_FALSE(err2.has_value());
+
+  auto dot1 = tkz.peek();
+  REQUIRE(dot1.has_value());
+  REQUIRE(dot1->value == lang::TokenV::opDot);
+  REQUIRE(tkz.consume().has_value());
+
+  auto dot2 = tkz.peek();
+  REQUIRE(dot2.has_value());
+  REQUIRE(dot2->value == lang::TokenV::opDot);
+  REQUIRE(tkz.consume().has_value());
+
+  auto last = tkz.peek();
+  REQUIRE(last.has_value());
+  REQUIRE(last->value == lang::TokenV::tkInteger);
+  REQUIRE(last->strValue == "2");
+  REQUIRE(tkz.consume().has_value());
+
+  auto eof = tkz.peek();
+  REQUIRE(eof.has_value());
+  REQUIRE(eof->value == lang::TokenV::tkEOF);
+}
+
+// Each malformed numeric prefix must be rejected with an "Invalid literal"
+// error on the first peek.
+TEST_CASE("Numbers_InvalidPrefixes", "[numbers][invalid]") {
+  const std::array<const char*, 5> invalids = { "0x", "0b", "0xG", "0b2", "08" };
+
+  for (auto src : invalids) {
+    // Keep the text in a named string that outlives the tokenizer, like the
+    // other tests do: tokens expose views into the source, so a temporary
+    // passed to the constructor could be gone before peek() reads it.
+    const std::string source{ src };
+    lang::Tokenizer tkz{ source };
+    auto tok = tkz.peek();
+    REQUIRE_FALSE(tok.has_value());
+    const auto &err = tok.error();
+    REQUIRE(
+      err.message.find("Invalid literal") != std::string::npos
+    );
+  }
+}
--- a/tests/Tokenizer/src/Operators.cpp
+++ b/tests/Tokenizer/src/Operators.cpp
@ -0,0 +1,178 @@
+#include <catch2/catch_test_macros.hpp>
+
+#include <array>
+#include <string>
+#include <vector>
+
+#include <artichoke/Tokenizer/Tokenizer.hpp>
+#include <Utils.hpp>
+
+namespace lang = arti::lang;
+
+// Shared driver for the operator tests.
+//
+// Builds a source text from `lexemes` via SourceFromTokens, tokenizes it and
+// requires that the i-th token has kind `kinds[i]` and text `lexemes[i]`.
+// Afterwards exactly N tokens must have been seen and the tokenizer must
+// report tkEOF.
+template <std::size_t N>
+static void CommonOpsSuccess(
+  const std::array<std::string_view, N> &lexemes,
+  const std::array<lang::TokenV, N> &kinds
+) {
+  static_assert(N > 0, "Must provide at least one operator");
+
+  const std::string source = SourceFromTokens(lexemes);
+  lang::Tokenizer tkz{ source };
+
+  std::size_t index = 0;
+  for (auto token : tkz.range()) {
+    REQUIRE(token.has_value());
+    // .at() is bounds-checked, so a surplus token fails instead of reading
+    // past the end of the expectation tables.
+    REQUIRE(token->value == kinds.at(index));
+    REQUIRE(token->strValue == lexemes.at(index));
+    index += 1;
+  }
+
+  // Every expected operator was produced ...
+  REQUIRE(index == lexemes.size());
+
+  // ... and nothing but end-of-file is left.
+  REQUIRE(tkz.peek().has_value());
+  REQUIRE(tkz.peek()->value == lang::TokenV::tkEOF);
+}
+
+TEST_CASE("Operators_SingleChar", "[operators][single]") {
+  // One-character operators; symbols[i] must tokenize as expected[i].
+  constexpr std::array<std::string_view, 25> symbols = {
+    ".", "%", "+", "-", "/",
+    "!", "*", ":", ",", "=",
+    ";", "^", "~", "&", "|",
+    "<", ">", "(", ")", "[",
+    "]", "{", "}", "?", "$"
+  };
+  constexpr std::array<lang::TokenV, 25> expected = {
+    lang::TokenV::opDot,       // .
+    lang::TokenV::opMod,       // %
+    lang::TokenV::opPlus,      // +
+    lang::TokenV::opHyphen,    // -
+    lang::TokenV::opSlash,     // /
+    lang::TokenV::opBang,      // !
+    lang::TokenV::opStar,      // *
+    lang::TokenV::opColon,     // :
+    lang::TokenV::opComma,     // ,
+    lang::TokenV::opAssign,    // =
+    lang::TokenV::opSemicolon, // ;
+    lang::TokenV::opCaret,     // ^
+    lang::TokenV::opTilde,     // ~
+    lang::TokenV::opAnd,       // &
+    lang::TokenV::opOr,        // |
+    lang::TokenV::opLt,        // <
+    lang::TokenV::opGt,        // >
+    lang::TokenV::opLParen,    // (
+    lang::TokenV::opRParen,    // )
+    lang::TokenV::opLBracket,  // [
+    lang::TokenV::opRBracket,  // ]
+    lang::TokenV::opLSquirly,  // {
+    lang::TokenV::opRSquirly,  // }
+    lang::TokenV::opOpt,       // ?
+    lang::TokenV::opMut        // $
+  };
+
+  CommonOpsSuccess(symbols, expected);
+}
+
+TEST_CASE("Operators_MultiChar", "[operators][multi]") {
+  // Two- and three-character operators; symbols[i] must tokenize as
+  // expected[i].
+  constexpr std::array<std::string_view, 22> symbols = {
+    "==", "!=", "<=", ">=",
+    "<<", ">>", "&&", "||",
+    "+=", "-=", "*=", "/=",
+    "%=", "&=", "|=",
+    "<<=", ">>=", "&&=", "||=",
+    "->", "::", ":="
+  };
+  constexpr std::array<lang::TokenV, 22> expected = {
+    lang::TokenV::opEq,            // ==
+    lang::TokenV::opNeq,           // !=
+    lang::TokenV::opLtEq,          // <=
+    lang::TokenV::opGtEq,          // >=
+    lang::TokenV::opLShift,        // <<
+    lang::TokenV::opRShift,        // >>
+    lang::TokenV::opBoolAnd,       // &&
+    lang::TokenV::opBoolOr,        // ||
+    lang::TokenV::opPlusAssign,    // +=
+    lang::TokenV::opHyphenAssign,  // -=
+    lang::TokenV::opStarAssign,    // *=
+    lang::TokenV::opSlashAssign,   // /=
+    lang::TokenV::opModAssign,     // %=
+    lang::TokenV::opAndAssign,     // &=
+    lang::TokenV::opOrAssign,      // |=
+    lang::TokenV::opLShiftAssign,  // <<=
+    lang::TokenV::opRShiftAssign,  // >>=
+    lang::TokenV::opBoolAndAssign, // &&=
+    lang::TokenV::opBoolORAssign,  // ||=
+    lang::TokenV::opArrow,         // ->
+    lang::TokenV::opAccess,        // ::
+    lang::TokenV::opLabel          // :=
+  };
+
+  CommonOpsSuccess(symbols, expected);
+}
+
+TEST_CASE("Operators_DotPrefixedSpecials", "[operators][dot][special]") {
+  // Operators formed by a '.' followed by a second symbol.
+  constexpr std::array<std::string_view, 4> symbols = {
+    ".#",
+    ".[",
+    ".*",
+    ".@"
+  };
+  constexpr std::array<lang::TokenV, 4> expected = {
+    lang::TokenV::opSliceSize, // .#
+    lang::TokenV::opPtrSlice,  // .[
+    lang::TokenV::opSlicePtr,  // .*
+    lang::TokenV::opReflect    // .@
+  };
+
+  CommonOpsSuccess(symbols, expected);
+}
+
+TEST_CASE("Operators_GreedyLongestMatch", "[operators][greedy]") {
+  // Each lexeme has a shorter operator as a prefix; the tokenizer must pick
+  // the longest valid operator instead of stopping early.
+  constexpr std::array<std::string_view, 8> symbols = {
+    ">>=",
+    "<<=",
+    "&&=",
+    "||=",
+    ">=",
+    "<=",
+    "->",
+    "::"
+  };
+  constexpr std::array<lang::TokenV, 8> expected = {
+    lang::TokenV::opRShiftAssign,  // >>=
+    lang::TokenV::opLShiftAssign,  // <<=
+    lang::TokenV::opBoolAndAssign, // &&=
+    lang::TokenV::opBoolORAssign,  // ||=
+    lang::TokenV::opGtEq,          // >=
+    lang::TokenV::opLtEq,          // <=
+    lang::TokenV::opArrow,         // ->
+    lang::TokenV::opAccess         // ::
+  };
+
+  CommonOpsSuccess(symbols, expected);
+}
+
+TEST_CASE("Operators_BoundariesWhitespace", "[operators][boundaries]") {
+  // Two '=' separated by whitespace (a newline here) must remain two
+  // separate opAssign tokens and never be fused into '=='.
+  const std::string source = "=\n=";
+  lang::Tokenizer tkz{ source };
+
+  for (int seen = 0; seen < 2; ++seen) {
+    auto assign = tkz.peek();
+    REQUIRE(assign.has_value());
+    REQUIRE(assign->value == lang::TokenV::opAssign);
+    REQUIRE(tkz.consume().has_value());
+  }
+
+  // Nothing else may follow the second '='.
+  auto tail = tkz.peek();
+  REQUIRE(tail.has_value());
+  REQUIRE(tail->value == lang::TokenV::tkEOF);
+}
+
+TEST_CASE("Operators_ContextAccessLabelArrow", "[operators][context]") {
+  // '::', ':=' and '->' embedded between identifiers, with and without
+  // surrounding whitespace.
+  const std::string source = "ns::name := src->field";
+  lang::Tokenizer tkz{ source };
+
+  // Peeks the next token, checks its kind and text, then consumes it.
+  const auto expectNext = [&tkz](lang::TokenV kind, std::string_view text) {
+    auto tok = tkz.peek();
+    REQUIRE(tok.has_value());
+    REQUIRE(tok->value == kind);
+    REQUIRE(tok->strValue == text);
+    REQUIRE(tkz.consume().has_value());
+  };
+
+  expectNext(lang::TokenV::tkIdentifier, "ns");
+  expectNext(lang::TokenV::opAccess, "::");
+  expectNext(lang::TokenV::tkIdentifier, "name");
+  expectNext(lang::TokenV::opLabel, ":=");
+  expectNext(lang::TokenV::tkIdentifier, "src");
+  expectNext(lang::TokenV::opArrow, "->");
+  expectNext(lang::TokenV::tkIdentifier, "field");
+
+  // The whole input has been consumed.
+  auto tail = tkz.peek();
+  REQUIRE(tail.has_value());
+  REQUIRE(tail->value == lang::TokenV::tkEOF);
+}
+
+TEST_CASE("Operators_InvalidStandalone_Error", "[operators][error]") {
+  // '#' and '@' are only valid as part of '.#' and '.@'; on their own they,
+  // like the backtick, must be rejected with an "Invalid ..." error.
+  const std::vector<std::string> rejected = { "#", "@", "`" };
+
+  for (const auto &input : rejected) {
+    lang::Tokenizer tkz{ input };
+
+    auto result = tkz.peek();
+    REQUIRE_FALSE(result.has_value());
+
+    const auto &failure = result.error();
+    REQUIRE(failure.message.find("Invalid") != std::string::npos);
+  }
+}
--- a/tests/Tokenizer/src/Strings.cpp
+++ b/tests/Tokenizer/src/Strings.cpp
@ -0,0 +1,168 @@
+#include <catch2/catch_test_macros.hpp>
+
+#include <array>
+#include <string>
+
+#include <artichoke/Tokenizer/Tokenizer.hpp>
+#include <artichoke/Util/Expected.hpp>
+#include <Utils.hpp>
+
+namespace lang = arti::lang;
+
+// Shared driver for the string/character literal tests.
+//
+// Builds a source text from `lexemes` via SourceFromTokens, tokenizes it and
+// requires every token to have kind `kind` and the text of the matching
+// lexeme. Afterwards exactly N tokens must have been seen and the tokenizer
+// must report tkEOF.
+template <std::size_t N>
+static void CommonLiteralsSuccess(
+  lang::TokenV kind,
+  const std::array<std::string_view, N> &lexemes
+) {
+  const std::string source = SourceFromTokens(lexemes);
+
+  std::size_t it = 0;
+  lang::Tokenizer tkz{ source };
+
+  for (auto token : tkz.range()) {
+    REQUIRE(token.has_value());
+    REQUIRE(token->value == kind);
+    // .at() is bounds-checked, so a surplus token fails instead of reading
+    // past the end of `lexemes`.
+    REQUIRE(token->strValue == lexemes.at(it));
+    // Advance outside the assertion so the check itself has no side effects
+    // (same shape as CommonOpsSuccess in the operator tests).
+    ++it;
+  }
+
+  REQUIRE(it == lexemes.size());
+  REQUIRE(tkz.peek().has_value());
+  REQUIRE(tkz.peek()->value == lang::TokenV::tkEOF);
+}
+
+TEST_CASE("Strings_Simple", "[strings][valid]") {
+  // Plain string literals without escapes: letters, spaces, digits and
+  // punctuation. Each must come back as one tkString with unchanged text.
+  constexpr std::array<std::string_view, 5> literals = {
+    R"("a")",
+    R"("hello")",
+    R"("with spaces")",
+    R"("12345")",
+    R"Q("!@#$%^&*()")Q"
+  };
+
+  CommonLiteralsSuccess(lang::TokenV::tkString, literals);
+}
+
+TEST_CASE("Strings_Escapes", "[strings][valid][escapes]") {
+  // Escape sequences must stay verbatim in the token text: the expected
+  // strValue is the raw lexeme, backslashes included.
+  constexpr std::array<std::string_view, 5> literals = {
+    R"("quote: \"")",
+    R"("backslash: \\")",
+    R"("newline: \n")",
+    R"("tab: \t")",
+    R"("mix: \"\\\n\t")"
+  };
+
+  CommonLiteralsSuccess(lang::TokenV::tkString, literals);
+}
+
+TEST_CASE("Strings_OperatorsAdjacency", "[strings][operators]") {
+  // No whitespace between the pieces: "foo"+"bar" must still split into
+  // tkString, opPlus, tkString.
+  const std::string source = R"("foo"+"bar")";
+  lang::Tokenizer tkz{ source };
+
+  {
+    auto lhs = tkz.peek();
+    REQUIRE(lhs.has_value());
+    REQUIRE(lhs->value == lang::TokenV::tkString);
+    REQUIRE(lhs->strValue == R"("foo")");
+    REQUIRE(tkz.consume().has_value());
+  }
+
+  {
+    auto plus = tkz.peek();
+    REQUIRE(plus.has_value());
+    REQUIRE(plus->value == lang::TokenV::opPlus);
+    REQUIRE(tkz.consume().has_value());
+  }
+
+  {
+    auto rhs = tkz.peek();
+    REQUIRE(rhs.has_value());
+    REQUIRE(rhs->value == lang::TokenV::tkString);
+    REQUIRE(rhs->strValue == R"("bar")");
+    REQUIRE(tkz.consume().has_value());
+  }
+
+  auto tail = tkz.peek();
+  REQUIRE(tail.has_value());
+  REQUIRE(tail->value == lang::TokenV::tkEOF);
+}
+
+TEST_CASE("Strings_Unterminated_Error", "[strings][error]") {
+  // A string literal that is never closed must be reported as an error.
+  const std::string text = "\"unterminated";
+  lang::Tokenizer lexer{ text };
+
+  auto result = lexer.peek();
+  REQUIRE_FALSE(result.has_value());
+
+  const auto &failure = result.error();
+  REQUIRE(failure.message.find("Invalid literal") != std::string::npos);
+}
+
+TEST_CASE("Chars_Simple", "[chars][valid]") {
+  // Single-character literals: lower/upper case letter, digit, underscore
+  // and a punctuation character.
+  constexpr std::array<std::string_view, 5> literals = {
+    R"('a')",
+    R"('Z')",
+    R"('0')",
+    R"('_')",
+    R"('$')"
+  };
+
+  CommonLiteralsSuccess(lang::TokenV::tkCharacter, literals);
+}
+
+TEST_CASE("Chars_Escapes", "[chars][valid][escapes]") {
+  // Escaped character literals; the token text is expected to keep the
+  // backslash form unchanged.
+  constexpr std::array<std::string_view, 4> literals = {
+    R"('\n')",
+    R"('\t')",
+    R"('\\')",
+    R"('\'')"
+  };
+
+  CommonLiteralsSuccess(lang::TokenV::tkCharacter, literals);
+}
+
+TEST_CASE("Chars_Invalid_Empty", "[chars][error]") {
+  // A character literal with nothing between the quotes is an error.
+  const std::string text = "''";
+  lang::Tokenizer lexer{ text };
+
+  auto result = lexer.peek();
+  REQUIRE_FALSE(result.has_value());
+
+  const auto &failure = result.error();
+  REQUIRE(failure.message.find("Invalid literal") != std::string::npos);
+}
+
+TEST_CASE("Chars_Invalid_Multiple", "[chars][error]") {
+  // More than one character between the quotes is an error.
+  const std::string text = "'ab'";
+  lang::Tokenizer lexer{ text };
+
+  auto result = lexer.peek();
+  REQUIRE_FALSE(result.has_value());
+
+  const auto &failure = result.error();
+  REQUIRE(failure.message.find("Invalid literal") != std::string::npos);
+}
+
+TEST_CASE("Chars_Unterminated", "[chars][error]") {
+  // A character literal missing its closing quote is an error.
+  const std::string text = "'a";
+  lang::Tokenizer lexer{ text };
+
+  auto result = lexer.peek();
+  REQUIRE_FALSE(result.has_value());
+
+  const auto &failure = result.error();
+  REQUIRE(failure.message.find("Invalid literal") != std::string::npos);
+}
+
+// The "[.escapes]" tag starts with '.', which makes this a hidden test in
+// Catch2: it does not run by default and must be selected explicitly.
+// NOTE(review): presumably hidden because invalid-escape detection is not
+// implemented yet - confirm.
+TEST_CASE("Chars_InvalidEscape", "[chars][error][.escapes]") {
+  // '\x' is expected to be rejected as an invalid literal.
+  const std::string text = "'\\x'";
+  lang::Tokenizer lexer{ text };
+
+  auto result = lexer.peek();
+  REQUIRE_FALSE(result.has_value());
+
+  const auto &failure = result.error();
+  REQUIRE(failure.message.find("Invalid literal") != std::string::npos);
+}
--- a/tests/cmake/dependencies.cmake
+++ b/tests/cmake/dependencies.cmake
@ -0,0 +1,18 @@
+include(FetchContent)
+
+# Bootstrap CPM.cmake: fetch the pinned release into the build tree and
+# verify it against its SHA256 before it is included.
+file(
+  DOWNLOAD
+  "https://github.com/cpm-cmake/CPM.cmake/releases/download/v0.40.8/CPM.cmake"
+  "${CMAKE_CURRENT_BINARY_DIR}/cmake/CPM.cmake"
+  EXPECTED_HASH
+    SHA256=78ba32abdf798bc616bab7c73aac32a17bbd7b06ad9e26a6add69de8f3ae4791
+)
+include("${CMAKE_CURRENT_BINARY_DIR}/cmake/CPM.cmake")
+
+# Test-only dependencies, pinned to an exact version.
+CPMAddPackage("gh:catchorg/Catch2@3.6.0")
+
+# Put Catch2's extras/ directory on the module search path so the CMake
+# helper scripts shipped there can be pulled in with include().
+list(APPEND CMAKE_MODULE_PATH "${Catch2_SOURCE_DIR}/extras")
--- a/tests/include/Utils.hpp
+++ b/tests/include/Utils.hpp
@ -0,0 +1,58 @@
+#pragma once
+
+#include <array>
+#include <cstdint>
+#include <random>
+#include <ranges>
+#include <string>
+#include <string_view>
+#include <type_traits>
+
+#include <artichoke/Coroutine/Generator.hpp>
+
+// Produces the elements of two ranges alternately: r1[0], r2[0], r1[1],
+// r2[1], ... An r1 element is always followed by an r2 element, so the
+// sequence ends on an element of r2.
+//
+// Iteration stops as soon as either range is exhausted; the check happens
+// once per pair, at the top of the loop.
+//
+// Both ranges must have the same value type, and that type must be
+// convertible to std::string_view.
+//
+// NOTE(review): `yield` is not a C++ keyword; presumably Generator.hpp
+// provides it as a macro for `co_yield` - confirm.
+// NOTE(review): r1 and r2 are forwarding references. If this is a lazily
+// started coroutine, the ranges they refer to must stay alive for as long as
+// the returned generator is being consumed - confirm against Generator.
+template <std::ranges::range R1, std::ranges::range R2>
+requires(
+  std::is_convertible_v<std::ranges::range_value_t<R1>, std::string_view> and
+  std::is_same_v<std::ranges::range_value_t<R1>, std::ranges::range_value_t<R2>>
+)
+arti::lang::Generator<std::ranges::range_value_t<R1>>
+InterleaveRanges(R1 &&r1, R2 &&r2) {
+  auto it1 = std::ranges::begin(r1);
+  auto end1 = std::ranges::end(r1);
+
+  auto it2 = std::ranges::begin(r2);
+  auto end2 = std::ranges::end(r2);
+
+  while (it1 != end1 && it2 != end2) {
+    yield *it1;
+    ++it1;
+    yield *it2;
+    ++it2;
+  }
+}
+
+// Endless generator of random whitespace runs.
+//
+// Every produced value is a string of 1..maxLen characters drawn uniformly
+// from space, tab and newline. A maxLen of 0 is treated as 1. The random
+// engine is seeded from std::random_device, so the sequence differs between
+// runs.
+//
+// The yielded value is built from an internal buffer that is rewritten for
+// the next run, so each value should be consumed before the generator is
+// advanced.
+//
+// NOTE(review): `yield` is presumably a macro for `co_yield` provided by
+// Generator.hpp - confirm.
+static arti::lang::Generator<std::string_view>
+WhitespaceGenerator(uint32_t maxLen = 5) {
+  constexpr std::array<char, 3> spaceChars{ ' ', '\t', '\n' };
+
+  // uniform_int_distribution(a, b) requires a <= b; without this clamp a
+  // maxLen of 0 would construct dist(1, 0), which is undefined behaviour.
+  const uint32_t longest = maxLen == 0 ? 1 : maxLen;
+
+  std::string str;
+  std::random_device device;
+  std::mt19937 engine(device());
+  std::uniform_int_distribution<uint32_t> dist(1, longest);
+  // Upper bound follows the table size so the two cannot drift apart.
+  std::uniform_int_distribution<uint32_t> distChars(
+    0, static_cast<uint32_t>(spaceChars.size() - 1)
+  );
+
+  str.reserve(longest);
+
+  while (true) {
+    str.clear();
+
+    const auto sz = dist(engine);
+
+    for (uint32_t i = 0; i < sz; ++i) {
+      str += spaceChars[distChars(engine)];
+    }
+
+    yield str;
+  }
+}
+
+// Builds a source text for tokenizer tests: every element of `tokens` is
+// followed by a random whitespace run of up to 10 characters (see
+// WhitespaceGenerator), and the pieces are concatenated into one
+// std::string. The result therefore also ends with whitespace.
+//
+// `tokens` must be a range whose value type is exactly std::string_view.
+template <std::ranges::range R>
+requires(std::is_same_v<std::ranges::range_value_t<R>, std::string_view>)
+static std::string SourceFromTokens(R &&tokens) {
+  // Keep this as one expression: InterleaveRanges takes its arguments by
+  // reference, and the WhitespaceGenerator temporary only lives until the end
+  // of this full expression.
+  return InterleaveRanges(tokens, WhitespaceGenerator(10)) | std::views::join |
+         std::ranges::to<std::string>();
+}