#ifndef TOKENIZER_CC #define TOKENIZER_CC #include "common.cc" #include "source.cc" #include "token.cc" #include "utf8.cc" struct Tokenizer { Buffer* buffer; Buffer_Stack* stack; Tokenizer(Buffer_Stack* stack) : buffer(nullptr), stack(stack) {} }; static inline Span tokenizer_make_span(const Tokenizer* tokenizer, usize start, usize end) { assert_neq(tokenizer, nullptr); Buffer* buffer = tokenizer->buffer; return Span(buffer->file, start, end); } static inline String tokenizer_make_lexeme(const Tokenizer* tokenizer, usize start, usize end) { assert_neq(tokenizer, nullptr); Buffer* buffer = tokenizer->buffer; return String(buffer->content[start], end - start); } static inline void tokenizer_make_token(const Tokenizer* tokenizer, Token* token, Token_Kind kind, usize start, usize end) { assert_neq(tokenizer, nullptr); assert_neq(token, nullptr); String lexeme = tokenizer_make_lexeme(tokenizer, start, end); Span span = tokenizer_make_span(tokenizer, start, end); *token = Token(kind, lexeme, span); } static Buffer* tokenizer_get_buffer(Tokenizer* tokenizer) { assert_neq(tokenizer, nullptr); if (tokenizer->buffer != nullptr) return tokenizer->buffer; Buffer* buffer = nullptr; if (!buffer_stack_pop(tokenizer->stack, &buffer)) return nullptr; tokenizer->buffer = buffer; return buffer; } static char tokenizer_advance(const Tokenizer* tokenizer, usize* offset) { const String text = tokenizer->buffer->content; unsigned char c = *text[*offset]; u8 nobytes = utf8_nobytes(c); if (nobytes > 1) panic("no support for multi-byte chars: %c:%d", c, nobytes); *offset += nobytes; return c; } bool tokenizer_next(Tokenizer* tokenizer, Token* out) { assert_neq(tokenizer, nullptr); assert_neq(out, nullptr); again: Buffer* buffer = tokenizer_get_buffer(tokenizer); if (buffer == nullptr) return false; usize start = buffer->cursor; if (start == buffer->content.length) { tokenizer->buffer = nullptr; goto again; } usize offset = start; tokenizer_advance(tokenizer, &offset); Token token = {}; tokenizer_make_token(tokenizer, &token, Token_Kind_Eof, start, offset); *out = token; buffer->cursor = offset; return true; } #endif