From 010b3e2e1eb3870724bbde6de7a0929b20bf2f75 Mon Sep 17 00:00:00 2001 From: Fabrice Date: Tue, 3 Mar 2026 07:57:50 +0100 Subject: working on utf8 handling and lexing --- src/tokenizer.cc | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 src/tokenizer.cc (limited to 'src/tokenizer.cc') diff --git a/src/tokenizer.cc b/src/tokenizer.cc new file mode 100644 index 0000000..54634de --- /dev/null +++ b/src/tokenizer.cc @@ -0,0 +1,58 @@ +#ifndef TOKENIZER_CC +#define TOKENIZER_CC + +#include "common.cc" +#include "source.cc" +#include "utf8.cc" +#include "token.cc" + +struct Tokenizer { + const Buffer* buffer; + Buffer_Stack* stack; + + Tokenizer(Buffer_Stack* stack) : buffer(nullptr), stack(stack) {} +}; + +static inline const Buffer* tokenizer_get_buffer(Tokenizer* tokenizer) { + assert_neq(tokenizer, nullptr); + + if(tokenizer->buffer != nullptr) return tokenizer->buffer; + + Buffer* buffer = nullptr; + if(!buffer_stack_pop(tokenizer->stack, &buffer)) return nullptr; + + tokenizer->buffer = buffer; + return buffer; +} + +static inline char tokenizer_advance(const Tokenizer* tokenizer, usize* offset) { + const String text = tokenizer->buffer->content; + + const unsigned char* c = text[*offset]; + wchar wc = (wchar)*c; + + u8 nobytes = utf8_nobytes(wc); + if(nobytes > 1) panic("no support for multi-byte chars: %d:%d", wc, nobytes); + + offset += nobytes; + return (char)wc; +} + +bool tokenizer_next(Tokenizer* tokenizer, Token* out) { + assert_neq(tokenizer, nullptr); + assert_neq(out, nullptr); + +again: + const Buffer* buffer = tokenizer_get_buffer(tokenizer); + if(buffer == nullptr) return false; + + usize offset = buffer->cursor; + if(offset == buffer->content.length) { + tokenizer->buffer = nullptr; + goto again; + } + + return false; +} + +#endif -- cgit v1.2.3