diff options
| author | Fabrice <fabrice@schaub-dev.xyz> | 2026-03-03 16:15:27 +0100 |
|---|---|---|
| committer | Fabrice <fabrice@schaub-dev.xyz> | 2026-03-03 16:15:27 +0100 |
| commit | 710197797f399a17fc3bcabe2d3816e728487571 (patch) | |
| tree | 1f27be9e39a8cb07765e74a9f3a3d3810a324019 /src/tokenizer.cc | |
| parent | 448b3baa76b6bd25e972e1134941cb649b6a91d0 (diff) | |
working on identifiers
Diffstat (limited to 'src/tokenizer.cc')
| -rw-r--r-- | src/tokenizer.cc | 35 |
1 files changed, 30 insertions, 5 deletions
```diff
diff --git a/src/tokenizer.cc b/src/tokenizer.cc
index 26763eb..029d440 100644
--- a/src/tokenizer.cc
+++ b/src/tokenizer.cc
@@ -57,15 +57,31 @@ static Buffer* tokenizer_get_buffer(Tokenizer* tokenizer, usize* cursor) {
   return curr;
 }
 
-static wchar tokenizer_advance(const Tokenizer* tokenizer, usize* offset) {
+[[nodiscard]] static bool tokenizer_advance(const Tokenizer* tokenizer,
+                                            usize* offset, wchar* out) {
   const String text = tokenizer->buffer->content;
+  usize curr_offset = *offset;
 
-  unsigned char c = *text[*offset];
+  assert_ste(curr_offset, text.length);
+  if (curr_offset == text.length) return false;
+
+  unsigned char c = *text[curr_offset];
   u8 nobytes = utf8_nobytes(c);
   if (nobytes > 1) panic("no support for multi-byte chars: %c:%d", c, nobytes);
 
   *offset += nobytes;
-  return (wchar)c;
+  *out = c;
+  return true;
+}
+
+static void tokenizer_lex_identifier(Tokenizer* tokenizer, usize* offset) {
+  assert_neq(tokenizer, nullptr);
+  assert_neq(offset, nullptr);
+
+  wchar c;
+  while (tokenizer_advance(tokenizer, offset, &c))
+    if (!utf8_is_alnum(c) || c == '_') break;
+
 }
 
 bool tokenizer_next(Tokenizer* tokenizer, Token* out) {
@@ -75,9 +91,17 @@ bool tokenizer_next(Tokenizer* tokenizer, Token* out) {
   usize cursor;
   Buffer* buffer = tokenizer_get_buffer(tokenizer, &cursor);
   if (buffer == nullptr) return false;
-
   usize advance = cursor;
-  wchar c = tokenizer_advance(tokenizer, &advance);
+
+  wchar c;
+  (void)tokenizer_advance(tokenizer, &advance,
+                          &c);  // We just checked that we are not at the end
+
+  if (utf8_is_identifier(c)) {
+    tokenizer_lex_identifier(tokenizer, &advance);
+    tokenizer_make_token(tokenizer, out, Token_Kind_Identifier, cursor, advance);
+    goto out;
+  }
 
   switch (c) {
     default:
@@ -85,6 +109,7 @@ bool tokenizer_next(Tokenizer* tokenizer, Token* out) {
                            advance);
   }
 
+out:
   buffer->cursor = advance;
   return true;
 }
```
