diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/token.cc | 17 | ||||
| -rw-r--r-- | src/tokenizer.cc | 83 | ||||
| -rw-r--r-- | src/utf8.cc | 10 |
3 files changed, 85 insertions, 25 deletions
diff --git a/src/token.cc b/src/token.cc index ed1f65b..171f7ee 100644 --- a/src/token.cc +++ b/src/token.cc @@ -7,11 +7,18 @@ #include "source.cc" #define TOKEN_KINDS_NOLEX \ - X(Eof) \ X(Invalid_Char) \ - X(Invalid_Literal) - -#define TOKEN_KINDS_SLEX X(Hash, '#') + X(Invalid_Literal) \ + X(Identifier) + +#define TOKEN_KINDS_SLEX \ + X(Hash, '#') \ + X(LBrace, '{') \ + X(RBrace, '}') \ + X(LParen, '(') \ + X(RParen, ')') \ + X(LBracket, '[') \ + X(RBracket, ']') #define TOKEN_KIND(name) Token_Kind_##name @@ -30,7 +37,7 @@ struct Token { String text; Span span; - Token() : kind(Token_Kind_Eof), text(), span() {} + Token() : kind(Token_Kind_Invalid_Char), text(), span() {} Token(Token_Kind kind, String text, Span span) : kind(kind), text(text), span(span) {} }; diff --git a/src/tokenizer.cc b/src/tokenizer.cc index f2efda4..61918a4 100644 --- a/src/tokenizer.cc +++ b/src/tokenizer.cc @@ -29,44 +29,68 @@ static inline String tokenizer_make_lexeme(const Tokenizer* tokenizer, return String(buffer->content[start], end - start); } -static inline void tokenizer_make_token(const Tokenizer* tokenizer, - Token* token, Token_Kind kind, - usize start, usize end) { +static inline Token tokenizer_make_token(const Tokenizer* tokenizer, + Token_Kind kind, usize start, + usize end) { assert_neq(tokenizer, nullptr); - assert_neq(token, nullptr); String lexeme = tokenizer_make_lexeme(tokenizer, start, end); Span span = tokenizer_make_span(tokenizer, start, end); - *token = Token(kind, lexeme, span); + return Token(kind, lexeme, span); } -static Buffer* tokenizer_get_buffer(Tokenizer* tokenizer, usize* cursor) { +static bool tokenizer_get_buffer(Tokenizer* tokenizer, usize* cursor) { assert_neq(tokenizer, nullptr); Buffer* curr = tokenizer->buffer; if (likely(curr != nullptr)) { *cursor = curr->cursor; - if(*cursor < curr->content.length) return curr; + if (*cursor < curr->content.length) return true; } - if (!buffer_stack_pop(tokenizer->stack, &curr)) return nullptr; + if (!buffer_stack_pop(tokenizer->stack, &curr)) return false; tokenizer->buffer = curr; *cursor = curr->cursor; - return curr; + return true; } -static char tokenizer_advance(const Tokenizer* tokenizer, - usize* offset) { +[[nodiscard]] static bool tokenizer_char(const Tokenizer* tokenizer, + usize offset, u8* out_nobytes, + wchar* out_char) { const String text = tokenizer->buffer->content; - unsigned char c = *text[*offset]; + assert_ste(offset, text.length); + if (offset == text.length) return false; + + unsigned char c = *text[offset]; u8 nobytes = utf8_nobytes(c); if (nobytes > 1) panic("no support for multi-byte chars: %c:%d", c, nobytes); - *offset += nobytes; - return c; + *out_nobytes = nobytes; + *out_char = (wchar)c; + return true; +} + +static void tokenizer_advance(usize* offset, usize nbytes) { + assert_neq(offset, nullptr); + *offset += nbytes; +} + +static Token tokenizer_lex_identifier(Tokenizer* tokenizer, usize start, + usize* offset) { + assert_neq(tokenizer, nullptr); + + wchar c; + u8 nobytes; + + while (tokenizer_char(tokenizer, *offset, &nobytes, &c)) { + if (!utf8_is_alnum(c) && c != '_') break; + tokenizer_advance(offset, nobytes); + } + + return tokenizer_make_token(tokenizer, Token_Kind_Identifier, start, *offset); } bool tokenizer_next(Tokenizer* tokenizer, Token* out) { @@ -74,15 +98,34 @@ bool tokenizer_next(Tokenizer* tokenizer, Token* out) { assert_neq(out, nullptr); usize cursor; - Buffer* buffer = tokenizer_get_buffer(tokenizer, &cursor); - if (buffer == nullptr) return false; + if (unlikely(!tokenizer_get_buffer(tokenizer, &cursor))) return false; usize advance = cursor; - tokenizer_advance(tokenizer, &advance); - Token token = {}; - tokenizer_make_token(tokenizer, &token, Token_Kind_Eof, cursor, advance); + wchar c; + u8 nobytes; + (void)tokenizer_char(tokenizer, advance, &nobytes, &c); + tokenizer_advance(&advance, nobytes); + + if (utf8_is_identifier(c)) { + *out = tokenizer_lex_identifier(tokenizer, cursor, &advance); + goto out; + } + + switch (c) { + #define X(name, lex) \ + case lex: \ + *out = tokenizer_make_token(tokenizer, TOKEN_KIND(name), cursor, advance); \ + break; + + TOKEN_KINDS_SLEX; + #undef X + default: + *out = tokenizer_make_token(tokenizer, Token_Kind_Invalid_Char, cursor, + advance); + } - *out = token; +out: + Buffer* buffer = tokenizer->buffer; buffer->cursor = advance; return true; } diff --git a/src/utf8.cc b/src/utf8.cc index 255508e..ace1ec0 100644 --- a/src/utf8.cc +++ b/src/utf8.cc @@ -22,4 +22,14 @@ inline u8 utf8_nobytes(unsigned char c) { panic("what even is: %d\n", c); } +inline bool utf8_is_identifier(wchar c) { + return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'); +} + +inline bool utf8_is_number(wchar c) { return ('0' <= c && c <= '9'); } + +inline bool utf8_is_alnum(wchar c) { + return utf8_is_identifier(c) || utf8_is_number(c); +} + #endif |
