#ifndef TOKENIZER_CC #define TOKENIZER_CC #include "common.cc" #include "source.cc" #include "token.cc" #include "utf8.cc" struct Tokenizer { Buffer* buffer; Buffer_Stack* stack; Tokenizer(Buffer_Stack* stack) : buffer(nullptr), stack(stack) {} }; static inline Span tokenizer_make_span(const Tokenizer* tokenizer, usize start, usize end) { assert_neq(tokenizer, nullptr); Buffer* buffer = tokenizer->buffer; return Span(buffer->file, start, end); } static inline String tokenizer_make_lexeme(const Tokenizer* tokenizer, usize start, usize end) { assert_neq(tokenizer, nullptr); Buffer* buffer = tokenizer->buffer; return String(buffer->content[start], end - start); } static inline Token tokenizer_make_token(const Tokenizer* tokenizer, Token_Kind kind, usize start, usize end) { assert_neq(tokenizer, nullptr); String lexeme = tokenizer_make_lexeme(tokenizer, start, end); Span span = tokenizer_make_span(tokenizer, start, end); return Token(kind, lexeme, span); } static bool tokenizer_get_buffer(Tokenizer* tokenizer, usize* cursor) { assert_neq(tokenizer, nullptr); Buffer* curr = tokenizer->buffer; if (likely(curr != nullptr)) { *cursor = curr->cursor; if (*cursor < curr->content.length) return true; } if (!buffer_stack_pop(tokenizer->stack, &curr)) return false; tokenizer->buffer = curr; *cursor = curr->cursor; return true; } [[nodiscard]] static bool tokenizer_char(const Tokenizer* tokenizer, usize offset, u8* out_nobytes, wchar* out_char) { const String text = tokenizer->buffer->content; assert_ste(offset, text.length); if (offset == text.length) return false; unsigned char c = *text[offset]; u8 nobytes = utf8_nobytes(c); if (nobytes > 1) panic("no support for multi-byte chars: %c:%d", c, nobytes); *out_nobytes = nobytes; *out_char = (wchar)c; return true; } static void tokenizer_advance(usize* offset, usize nbytes) { assert_neq(offset, nullptr); *offset += nbytes; } static Token tokenizer_lex_identifier(Tokenizer* tokenizer, usize start, usize* offset) { assert_neq(tokenizer, nullptr); wchar c; u8 nobytes; while (tokenizer_char(tokenizer, *offset, &nobytes, &c)) { if (!utf8_is_alnum(c) && c != '_') break; tokenizer_advance(offset, nobytes); } return tokenizer_make_token(tokenizer, Token_Kind_Identifier, start, *offset); } bool tokenizer_next(Tokenizer* tokenizer, Token* out) { assert_neq(tokenizer, nullptr); assert_neq(out, nullptr); usize cursor; if (unlikely(!tokenizer_get_buffer(tokenizer, &cursor))) return false; usize advance = cursor; wchar c; u8 nobytes; (void)tokenizer_char(tokenizer, advance, &nobytes, &c); tokenizer_advance(&advance, nobytes); if (utf8_is_identifier(c)) { *out = tokenizer_lex_identifier(tokenizer, cursor, &advance); goto out; } switch (c) { default: *out = tokenizer_make_token(tokenizer, Token_Kind_Invalid_Char, cursor, advance); } out: tokenizer->buffer->cursor = advance; return true; } #endif