From 010b3e2e1eb3870724bbde6de7a0929b20bf2f75 Mon Sep 17 00:00:00 2001 From: Fabrice Date: Tue, 3 Mar 2026 07:57:50 +0100 Subject: working on utf8 handling and lexing --- src/common.cc | 1 - src/source.cc | 16 ++++++++-------- src/token.cc | 2 ++ src/tokenizer.cc | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/utf8.cc | 25 ++++++++++++++++++++++++ src/voidc.cc | 9 ++------- 6 files changed, 95 insertions(+), 16 deletions(-) create mode 100644 src/tokenizer.cc create mode 100644 src/utf8.cc diff --git a/src/common.cc b/src/common.cc index 64c462b..de2f6de 100644 --- a/src/common.cc +++ b/src/common.cc @@ -122,7 +122,6 @@ static inline void link_remove(Link* item) { Link* next = item->next; if (prev != nullptr) prev->next = next; - if (next != nullptr) next->prev = prev; item->prev = item->next = nullptr; diff --git a/src/source.cc b/src/source.cc index ff1257b..c77c089 100644 --- a/src/source.cc +++ b/src/source.cc @@ -71,24 +71,24 @@ struct Buffer_Stack { Link* stack; }; -void buffer_stack_push(Buffer_Stack* manager, Buffer* b) { - assert_neq(manager, nullptr); +void buffer_stack_push(Buffer_Stack* stack, Buffer* b) { + assert_neq(stack, nullptr); assert_neq(b, nullptr); - if (likely(manager->stack != nullptr)) link_after(manager->stack, &b->link); - manager->stack = &b->link; + if (likely(stack->stack != nullptr)) link_after(stack->stack, &b->link); + stack->stack = &b->link; } -bool buffer_stack_pop(Buffer_Stack* manager, Buffer** b) { - assert_neq(manager, nullptr); +bool buffer_stack_pop(Buffer_Stack* stack, Buffer** b) { + assert_neq(stack, nullptr); assert_neq(b, nullptr); - Link* link = manager->stack; + Link* link = stack->stack; if (unlikely(link == nullptr)) return false; Link* next = link->prev; link_remove(link); - manager->stack = next; + stack->stack = next; Buffer* buffer = containerof(Buffer, link, link); *b = buffer; diff --git a/src/token.cc b/src/token.cc index ea0a4e5..e53abfe 100644 --- a/src/token.cc +++ b/src/token.cc @@ 
-27,6 +27,8 @@ struct Token { Token_Kind kind; String text; Span span; + + Token(Token_Kind kind, String text, Span span) : kind(kind), text(text), span(span) {} }; #endif diff --git a/src/tokenizer.cc b/src/tokenizer.cc new file mode 100644 index 0000000..54634de --- /dev/null +++ b/src/tokenizer.cc @@ -0,0 +1,58 @@ +#ifndef TOKENIZER_CC +#define TOKENIZER_CC + +#include "common.cc" +#include "source.cc" +#include "utf8.cc" +#include "token.cc" + +struct Tokenizer { + const Buffer* buffer; + Buffer_Stack* stack; + + Tokenizer(Buffer_Stack* stack) : buffer(nullptr), stack(stack) {} +}; + +static inline const Buffer* tokenizer_get_buffer(Tokenizer* tokenizer) { + assert_neq(tokenizer, nullptr); + + if(tokenizer->buffer != nullptr) return tokenizer->buffer; + + Buffer* buffer = nullptr; + if(!buffer_stack_pop(tokenizer->stack, &buffer)) return nullptr; + + tokenizer->buffer = buffer; + return buffer; +} + +static inline char tokenizer_advance(const Tokenizer* tokenizer, usize* offset) { /* returns the single-byte char at *offset in the current buffer and moves *offset past it; panics on multi-byte sequences */ + const String text = tokenizer->buffer->content; + + const unsigned char* c = text[*offset]; + wchar wc = (wchar)*c; + + u8 nobytes = utf8_nobytes(wc); + if(nobytes > 1) panic("no support for multi-byte chars: %d:%d", wc, nobytes); + + *offset += nobytes; /* write through the pointer so the caller's cursor advances; bumping the local pointer copy had no effect */ + return (char)wc; +} + +bool tokenizer_next(Tokenizer* tokenizer, Token* out) { + assert_neq(tokenizer, nullptr); + assert_neq(out, nullptr); + +again: + const Buffer* buffer = tokenizer_get_buffer(tokenizer); + if(buffer == nullptr) return false; + + usize offset = buffer->cursor; + if(offset == buffer->content.length) { + tokenizer->buffer = nullptr; + goto again; + } + + return false; +} + +#endif diff --git a/src/utf8.cc b/src/utf8.cc new file mode 100644 index 0000000..5f3a57a --- /dev/null +++ b/src/utf8.cc @@ -0,0 +1,25 @@ +#ifndef UTF8_CC +#define UTF8_CC + +typedef i32 wchar; + +#define UTF8_1BYTE 0x80 + +#define UTF8_2SHIFT 5 +#define UTF8_2BYTE 0x6 + +#define UTF8_3SHIFT 4 +#define UTF8_3BYTE 0xE + +#define UTF8_4SHIFT 3 
+#define UTF8_4BYTE 0x1E + +inline u8 utf8_nobytes(wchar c) { + if(c < UTF8_1BYTE) return 1; + if((c >> UTF8_2SHIFT) == UTF8_2BYTE) return 2; + if((c >> UTF8_3SHIFT) == UTF8_3BYTE) return 3; + if((c >> UTF8_4SHIFT) == UTF8_4BYTE) return 4; + panic("what even is: %d\n", c); +} + +#endif diff --git a/src/voidc.cc b/src/voidc.cc index fbb9577..e6c76e8 100644 --- a/src/voidc.cc +++ b/src/voidc.cc @@ -1,9 +1,9 @@ #include -#include #include "common.cc" #include "memory.cc" #include "source.cc" +#include "tokenizer.cc" static const char* SOURCE = R"( #include @@ -26,13 +26,8 @@ int main() { bool ret = buffer_init(heap_allocator(), &source, &file, &buffer); if (!ret) return EXIT_FAILURE; - buffer_stack_push(&stack, buffer); buffer_stack_push(&stack, buffer); - int c = 0; - while(buffer_stack_pop(&stack, &buffer)) { - c += 1; - } + Tokenizer tokenizer(&stack); - assert(c == 2); } -- cgit v1.2.3