summary | refs | log | tree | commit | diff
path: root/src/tokenizer.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/tokenizer.cc')
-rw-r--r-- src/tokenizer.cc 78
1 files changed, 57 insertions, 21 deletions
diff --git a/src/tokenizer.cc b/src/tokenizer.cc
index f2efda4..acd9627 100644
--- a/src/tokenizer.cc
+++ b/src/tokenizer.cc
@@ -29,44 +29,68 @@ static inline String tokenizer_make_lexeme(const Tokenizer* tokenizer,
return String(buffer->content[start], end - start);
}
-static inline void tokenizer_make_token(const Tokenizer* tokenizer,
- Token* token, Token_Kind kind,
- usize start, usize end) {
+static inline Token tokenizer_make_token(const Tokenizer* tokenizer,
+ Token_Kind kind, usize start,
+ usize end) {
assert_neq(tokenizer, nullptr);
- assert_neq(token, nullptr);
String lexeme = tokenizer_make_lexeme(tokenizer, start, end);
Span span = tokenizer_make_span(tokenizer, start, end);
- *token = Token(kind, lexeme, span);
+ return Token(kind, lexeme, span);
}
-static Buffer* tokenizer_get_buffer(Tokenizer* tokenizer, usize* cursor) {
+static bool tokenizer_get_buffer(Tokenizer* tokenizer, usize* cursor) {
assert_neq(tokenizer, nullptr);
Buffer* curr = tokenizer->buffer;
if (likely(curr != nullptr)) {
*cursor = curr->cursor;
- if(*cursor < curr->content.length) return curr;
+ if (*cursor < curr->content.length) return true;
}
- if (!buffer_stack_pop(tokenizer->stack, &curr)) return nullptr;
+ if (!buffer_stack_pop(tokenizer->stack, &curr)) return false;
tokenizer->buffer = curr;
*cursor = curr->cursor;
- return curr;
+ return true;
}
-static char tokenizer_advance(const Tokenizer* tokenizer,
- usize* offset) {
+[[nodiscard]] static bool tokenizer_char(const Tokenizer* tokenizer,
+ usize offset, u8* out_nobytes,
+ wchar* out_char) {
const String text = tokenizer->buffer->content;
- unsigned char c = *text[*offset];
+ assert_ste(offset, text.length);
+ if (offset == text.length) return false;
+
+ unsigned char c = *text[offset];
u8 nobytes = utf8_nobytes(c);
if (nobytes > 1) panic("no support for multi-byte chars: %c:%d", c, nobytes);
- *offset += nobytes;
- return c;
+ *out_nobytes = nobytes;
+ *out_char = (wchar)c;
+ return true;
+}
+
+static void tokenizer_advance(usize* offset, usize nbytes) {
+ assert_neq(offset, nullptr);
+ *offset += nbytes;
+}
+
+static Token tokenizer_lex_identifier(Tokenizer* tokenizer, usize start,
+ usize* offset) {
+ assert_neq(tokenizer, nullptr);
+
+ wchar c;
+ u8 nobytes;
+
+ while (tokenizer_char(tokenizer, *offset, &nobytes, &c)) {
+ if (!utf8_is_alnum(c) && c != '_') break;
+ tokenizer_advance(offset, nobytes);
+ }
+
+ return tokenizer_make_token(tokenizer, Token_Kind_Identifier, start, *offset);
}
bool tokenizer_next(Tokenizer* tokenizer, Token* out) {
@@ -74,16 +98,28 @@ bool tokenizer_next(Tokenizer* tokenizer, Token* out) {
assert_neq(out, nullptr);
usize cursor;
- Buffer* buffer = tokenizer_get_buffer(tokenizer, &cursor);
- if (buffer == nullptr) return false;
+ if (unlikely(!tokenizer_get_buffer(tokenizer, &cursor))) return false;
usize advance = cursor;
- tokenizer_advance(tokenizer, &advance);
- Token token = {};
- tokenizer_make_token(tokenizer, &token, Token_Kind_Eof, cursor, advance);
+ wchar c;
+ u8 nobytes;
+
+ (void)tokenizer_char(tokenizer, advance, &nobytes, &c);
+ tokenizer_advance(&advance, nobytes);
+
+ if (utf8_is_identifier(c)) {
+ *out = tokenizer_lex_identifier(tokenizer, cursor, &advance);
+ goto out;
+ }
+
+ switch (c) {
+ default:
+ *out = tokenizer_make_token(tokenizer, Token_Kind_Invalid_Char, cursor,
+ advance);
+ }
- *out = token;
- buffer->cursor = advance;
+out:
+ tokenizer->buffer->cursor = advance;
return true;
}