#ifndef TOKENIZER_CC #define TOKENIZER_CC #include "common.cc" #include "source.cc" #include "token.cc" #include "utf8.cc" struct Tokenizer { Buffer* buffer; Buffer_Stack* stack; Tokenizer(Buffer_Stack* stack) : buffer(nullptr), stack(stack) {} }; static inline Buffer* tokenizer_get_buffer(Tokenizer* tokenizer) { assert_neq(tokenizer, nullptr); if (tokenizer->buffer != nullptr) return tokenizer->buffer; Buffer* buffer = nullptr; if (!buffer_stack_pop(tokenizer->stack, &buffer)) return nullptr; tokenizer->buffer = buffer; return buffer; } static inline char tokenizer_advance(const Tokenizer* tokenizer, usize* offset) { const String text = tokenizer->buffer->content; unsigned char c = *text[*offset]; u8 nobytes = utf8_nobytes(c); if (nobytes > 1) panic("no support for multi-byte chars: %c:%d", c, nobytes); *offset += nobytes; return c; } static inline String tokenizer_make_lexeme(const Tokenizer* tokenizer, usize start, usize end) { assert_neq(tokenizer, nullptr); Buffer* buffer = tokenizer->buffer; return String(buffer->content[start], end - start); } bool tokenizer_next(Tokenizer* tokenizer, Token* out) { assert_neq(tokenizer, nullptr); assert_neq(out, nullptr); again: Buffer* buffer = tokenizer_get_buffer(tokenizer); if (buffer == nullptr) return false; usize start = buffer->cursor; if (start == buffer->content.length) { tokenizer->buffer = nullptr; goto again; } usize offset = start; tokenizer_advance(tokenizer, &offset); String lexeme = tokenizer_make_lexeme(tokenizer, start, offset); *out = Token(Token_Kind_Eof, lexeme, Span(buffer->file, 0, 0)); buffer->cursor = offset; return true; } #endif