summary | refs | log | tree | commit | diff
path: root/src/tokenizer.cc
diff options
context:
space:
mode:
author: Fabrice <fabrice@schaub-dev.xyz> 2026-03-03 16:15:27 +0100
committer: Fabrice <fabrice@schaub-dev.xyz> 2026-03-03 16:15:27 +0100
commit: 710197797f399a17fc3bcabe2d3816e728487571 (patch)
tree: 1f27be9e39a8cb07765e74a9f3a3d3810a324019 /src/tokenizer.cc
parent: 448b3baa76b6bd25e972e1134941cb649b6a91d0 (diff)
working on identifiers
Diffstat (limited to 'src/tokenizer.cc')
-rw-r--r-- src/tokenizer.cc 35
1 files changed, 30 insertions, 5 deletions
diff --git a/src/tokenizer.cc b/src/tokenizer.cc
index 26763eb..029d440 100644
--- a/src/tokenizer.cc
+++ b/src/tokenizer.cc
@@ -57,15 +57,31 @@ static Buffer* tokenizer_get_buffer(Tokenizer* tokenizer, usize* cursor) {
return curr;
}
-static wchar tokenizer_advance(const Tokenizer* tokenizer, usize* offset) {
+[[nodiscard]] static bool tokenizer_advance(const Tokenizer* tokenizer,
+ usize* offset, wchar* out) {  // reads the char at *offset into *out and steps past it; false at end of text
const String text = tokenizer->buffer->content;
+ usize curr_offset = *offset;
- unsigned char c = *text[*offset];
+ assert_ste(curr_offset, text.length);  // presumably asserts curr_offset <= text.length -- confirm against assert_ste
+ if (curr_offset == text.length) return false;  // nothing left to read; *offset and *out are left untouched
+
+ unsigned char c = *text[curr_offset];
u8 nobytes = utf8_nobytes(c);
if (nobytes > 1) panic("no support for multi-byte chars: %c:%d", c, nobytes);  // calls panic when utf8_nobytes reports more than one byte
*offset += nobytes;  // move the caller's offset past the character just read
- return (wchar)c;
+ *out = c;
+ return true;
+}
+
+// Extends *offset over the rest of an identifier, i.e. over every following
+// character that is alphanumeric or '_'. On return *offset sits just past the
+// last identifier character; the character that ended the run is not consumed.
+static void tokenizer_lex_identifier(Tokenizer* tokenizer, usize* offset) {
+  assert_neq(tokenizer, nullptr);
+  assert_neq(offset, nullptr);
+
+  // Look ahead with a scratch offset so the terminating character stays
+  // available for the next token instead of being swallowed here.
+  usize next = *offset;
+  wchar c;
+  while (tokenizer_advance(tokenizer, &next, &c)) {
+    // Stop only when c is neither alphanumeric nor '_'.
+    if (!utf8_is_alnum(c) && c != '_') break;
+    *offset = next;
+  }
}
bool tokenizer_next(Tokenizer* tokenizer, Token* out) {
@@ -75,9 +91,17 @@ bool tokenizer_next(Tokenizer* tokenizer, Token* out) {
usize cursor;
Buffer* buffer = tokenizer_get_buffer(tokenizer, &cursor);
if (buffer == nullptr) return false;
-
usize advance = cursor;
- wchar c = tokenizer_advance(tokenizer, &advance);
+
+ wchar c;
+ (void)tokenizer_advance(tokenizer, &advance,
+ &c); // We just checked that we are not at the end
+
+ if (utf8_is_identifier(c)) {
+ tokenizer_lex_identifier(tokenizer, &advance);
+ tokenizer_make_token(tokenizer, out, Token_Kind_Identifier, cursor, advance);
+ goto out;
+ }
switch (c) {
default:
@@ -85,6 +109,7 @@ bool tokenizer_next(Tokenizer* tokenizer, Token* out) {
advance);
}
+out:
buffer->cursor = advance;
return true;
}