summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
author Fabrice <fabrice@schaub-dev.xyz> 2026-03-03 17:29:14 +0100
committer Fabrice <fabrice@schaub-dev.xyz> 2026-03-03 17:29:14 +0100
commit ff6bfbe12724f977e5cfe4e7baadc8a4eb82a7db (patch)
tree 5737e48174623a609fa6f2ef97abea97a126eddf /src
parent fa029dea1bcd4b90df50743ebc5b7b573c4f39df (diff)
correctly advance
Diffstat (limited to 'src')
-rw-r--r-- src/tokenizer.cc 31
-rw-r--r-- src/utf8.cc 4
2 files changed, 20 insertions, 15 deletions
diff --git a/src/tokenizer.cc b/src/tokenizer.cc
index 8725e55..acd9627 100644
--- a/src/tokenizer.cc
+++ b/src/tokenizer.cc
@@ -30,8 +30,8 @@ static inline String tokenizer_make_lexeme(const Tokenizer* tokenizer,
}
static inline Token tokenizer_make_token(const Tokenizer* tokenizer,
- Token_Kind kind,
- usize start, usize end) {
+ Token_Kind kind, usize start,
+ usize end) {
assert_neq(tokenizer, nullptr);
String lexeme = tokenizer_make_lexeme(tokenizer, start, end);
@@ -57,7 +57,8 @@ static bool tokenizer_get_buffer(Tokenizer* tokenizer, usize* cursor) {
}
[[nodiscard]] static bool tokenizer_char(const Tokenizer* tokenizer,
- usize offset, wchar* out) {
+ usize offset, u8* out_nobytes,
+ wchar* out_char) {
const String text = tokenizer->buffer->content;
assert_ste(offset, text.length);
@@ -67,7 +68,8 @@ static bool tokenizer_get_buffer(Tokenizer* tokenizer, usize* cursor) {
u8 nobytes = utf8_nobytes(c);
if (nobytes > 1) panic("no support for multi-byte chars: %c:%d", c, nobytes);
- *out = c;
+ *out_nobytes = nobytes;
+ *out_char = (wchar)c;
return true;
}
@@ -76,13 +78,16 @@ static void tokenizer_advance(usize* offset, usize nbytes) {
*offset += nbytes;
}
-static Token tokenizer_lex_identifier(Tokenizer* tokenizer, usize start, usize *offset) {
+static Token tokenizer_lex_identifier(Tokenizer* tokenizer, usize start,
+ usize* offset) {
assert_neq(tokenizer, nullptr);
wchar c;
- while (tokenizer_char(tokenizer, *offset, &c)) {
+ u8 nobytes;
+
+ while (tokenizer_char(tokenizer, *offset, &nobytes, &c)) {
if (!utf8_is_alnum(c) && c != '_') break;
- tokenizer_advance(offset, 1);
+ tokenizer_advance(offset, nobytes);
}
return tokenizer_make_token(tokenizer, Token_Kind_Identifier, start, *offset);
@@ -93,12 +98,15 @@ bool tokenizer_next(Tokenizer* tokenizer, Token* out) {
assert_neq(out, nullptr);
usize cursor;
- if(unlikely(!tokenizer_get_buffer(tokenizer, &cursor))) return false;
+ if (unlikely(!tokenizer_get_buffer(tokenizer, &cursor))) return false;
usize advance = cursor;
wchar c;
- (void)tokenizer_char(tokenizer, advance, &c);
-
+ u8 nobytes;
+
+ (void)tokenizer_char(tokenizer, advance, &nobytes, &c);
+ tokenizer_advance(&advance, nobytes);
+
if (utf8_is_identifier(c)) {
*out = tokenizer_lex_identifier(tokenizer, cursor, &advance);
goto out;
@@ -106,9 +114,8 @@ bool tokenizer_next(Tokenizer* tokenizer, Token* out) {
switch (c) {
default:
- tokenizer_advance(&advance, 1);
*out = tokenizer_make_token(tokenizer, Token_Kind_Invalid_Char, cursor,
- advance);
+ advance);
}
out:
diff --git a/src/utf8.cc b/src/utf8.cc
index e2da09d..ace1ec0 100644
--- a/src/utf8.cc
+++ b/src/utf8.cc
@@ -26,9 +26,7 @@ inline bool utf8_is_identifier(wchar c) {
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z');
}
-inline bool utf8_is_number(wchar c) {
- return ('0' <= c && c <= '9');
-}
+inline bool utf8_is_number(wchar c) { return ('0' <= c && c <= '9'); }
inline bool utf8_is_alnum(wchar c) {
return utf8_is_identifier(c) || utf8_is_number(c);