diff options
Diffstat (limited to 'src/tokenizer.cc')
| -rw-r--r-- | src/tokenizer.cc | 31 |
1 files changed, 19 insertions, 12 deletions
diff --git a/src/tokenizer.cc b/src/tokenizer.cc index 8725e55..acd9627 100644 --- a/src/tokenizer.cc +++ b/src/tokenizer.cc @@ -30,8 +30,8 @@ static inline String tokenizer_make_lexeme(const Tokenizer* tokenizer, } static inline Token tokenizer_make_token(const Tokenizer* tokenizer, - Token_Kind kind, - usize start, usize end) { + Token_Kind kind, usize start, + usize end) { assert_neq(tokenizer, nullptr); String lexeme = tokenizer_make_lexeme(tokenizer, start, end); @@ -57,7 +57,8 @@ static bool tokenizer_get_buffer(Tokenizer* tokenizer, usize* cursor) { } [[nodiscard]] static bool tokenizer_char(const Tokenizer* tokenizer, - usize offset, wchar* out) { + usize offset, u8* out_nobytes, + wchar* out_char) { const String text = tokenizer->buffer->content; assert_ste(offset, text.length); @@ -67,7 +68,8 @@ static bool tokenizer_get_buffer(Tokenizer* tokenizer, usize* cursor) { u8 nobytes = utf8_nobytes(c); if (nobytes > 1) panic("no support for multi-byte chars: %c:%d", c, nobytes); - *out = c; + *out_nobytes = nobytes; + *out_char = (wchar)c; return true; } @@ -76,13 +78,16 @@ static void tokenizer_advance(usize* offset, usize nbytes) { *offset += nbytes; } -static Token tokenizer_lex_identifier(Tokenizer* tokenizer, usize start, usize *offset) { +static Token tokenizer_lex_identifier(Tokenizer* tokenizer, usize start, + usize* offset) { assert_neq(tokenizer, nullptr); wchar c; - while (tokenizer_char(tokenizer, *offset, &c)) { + u8 nobytes; + + while (tokenizer_char(tokenizer, *offset, &nobytes, &c)) { if (!utf8_is_alnum(c) && c != '_') break; - tokenizer_advance(offset, 1); + tokenizer_advance(offset, nobytes); } return tokenizer_make_token(tokenizer, Token_Kind_Identifier, start, *offset); @@ -93,12 +98,15 @@ bool tokenizer_next(Tokenizer* tokenizer, Token* out) { assert_neq(out, nullptr); usize cursor; - if(unlikely(!tokenizer_get_buffer(tokenizer, &cursor))) return false; + if (unlikely(!tokenizer_get_buffer(tokenizer, &cursor))) return false; usize advance = cursor; wchar c; - (void)tokenizer_char(tokenizer, advance, &c); - + u8 nobytes; + + (void)tokenizer_char(tokenizer, advance, &nobytes, &c); + tokenizer_advance(&advance, nobytes); + if (utf8_is_identifier(c)) { *out = tokenizer_lex_identifier(tokenizer, cursor, &advance); goto out; @@ -106,9 +114,8 @@ bool tokenizer_next(Tokenizer* tokenizer, Token* out) { switch (c) { default: - tokenizer_advance(&advance, 1); *out = tokenizer_make_token(tokenizer, Token_Kind_Invalid_Char, cursor, - advance); + advance); } out: |
