diff options
Diffstat (limited to 'src/tokenizer.cc')
| -rw-r--r-- | src/tokenizer.cc | 78 |
1 files changed, 57 insertions, 21 deletions
diff --git a/src/tokenizer.cc b/src/tokenizer.cc index f2efda4..acd9627 100644 --- a/src/tokenizer.cc +++ b/src/tokenizer.cc @@ -29,44 +29,68 @@ static inline String tokenizer_make_lexeme(const Tokenizer* tokenizer, return String(buffer->content[start], end - start); } -static inline void tokenizer_make_token(const Tokenizer* tokenizer, - Token* token, Token_Kind kind, - usize start, usize end) { +static inline Token tokenizer_make_token(const Tokenizer* tokenizer, + Token_Kind kind, usize start, + usize end) { assert_neq(tokenizer, nullptr); - assert_neq(token, nullptr); String lexeme = tokenizer_make_lexeme(tokenizer, start, end); Span span = tokenizer_make_span(tokenizer, start, end); - *token = Token(kind, lexeme, span); + return Token(kind, lexeme, span); } -static Buffer* tokenizer_get_buffer(Tokenizer* tokenizer, usize* cursor) { +static bool tokenizer_get_buffer(Tokenizer* tokenizer, usize* cursor) { assert_neq(tokenizer, nullptr); Buffer* curr = tokenizer->buffer; if (likely(curr != nullptr)) { *cursor = curr->cursor; - if(*cursor < curr->content.length) return curr; + if (*cursor < curr->content.length) return true; } - if (!buffer_stack_pop(tokenizer->stack, &curr)) return nullptr; + if (!buffer_stack_pop(tokenizer->stack, &curr)) return false; tokenizer->buffer = curr; *cursor = curr->cursor; - return curr; + return true; } -static char tokenizer_advance(const Tokenizer* tokenizer, - usize* offset) { +[[nodiscard]] static bool tokenizer_char(const Tokenizer* tokenizer, + usize offset, u8* out_nobytes, + wchar* out_char) { const String text = tokenizer->buffer->content; - unsigned char c = *text[*offset]; + assert_ste(offset, text.length); + if (offset == text.length) return false; + + unsigned char c = *text[offset]; u8 nobytes = utf8_nobytes(c); if (nobytes > 1) panic("no support for multi-byte chars: %c:%d", c, nobytes); - *offset += nobytes; - return c; + *out_nobytes = nobytes; + *out_char = (wchar)c; + return true; +} + +static void tokenizer_advance(usize* offset, usize nbytes) { + assert_neq(offset, nullptr); + *offset += nbytes; +} + +static Token tokenizer_lex_identifier(Tokenizer* tokenizer, usize start, + usize* offset) { + assert_neq(tokenizer, nullptr); + + wchar c; + u8 nobytes; + + while (tokenizer_char(tokenizer, *offset, &nobytes, &c)) { + if (!utf8_is_alnum(c) && c != '_') break; + tokenizer_advance(offset, nobytes); + } + + return tokenizer_make_token(tokenizer, Token_Kind_Identifier, start, *offset); } bool tokenizer_next(Tokenizer* tokenizer, Token* out) { @@ -74,16 +98,28 @@ bool tokenizer_next(Tokenizer* tokenizer, Token* out) { assert_neq(out, nullptr); usize cursor; - Buffer* buffer = tokenizer_get_buffer(tokenizer, &cursor); - if (buffer == nullptr) return false; + if (unlikely(!tokenizer_get_buffer(tokenizer, &cursor))) return false; usize advance = cursor; - tokenizer_advance(tokenizer, &advance); - Token token = {}; - tokenizer_make_token(tokenizer, &token, Token_Kind_Eof, cursor, advance); + wchar c; + u8 nobytes; + + (void)tokenizer_char(tokenizer, advance, &nobytes, &c); + tokenizer_advance(&advance, nobytes); + + if (utf8_is_identifier(c)) { + *out = tokenizer_lex_identifier(tokenizer, cursor, &advance); + goto out; + } + + switch (c) { + default: + *out = tokenizer_make_token(tokenizer, Token_Kind_Invalid_Char, cursor, + advance); + } - *out = token; - buffer->cursor = advance; +out: + tokenizer->buffer->cursor = advance; return true; } |
