1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
|
#ifndef TOKENIZER_CC
#define TOKENIZER_CC
#include "common.cc"
#include "source.cc"
#include "token.cc"
#include "utf8.cc"
// Tokenizer state: the buffer currently being read plus the stack that
// further buffers are popped from (see tokenizer_get_buffer).
struct Tokenizer {
  // Buffer being tokenized; null until one is popped from `stack`.
  Buffer* buffer = nullptr;
  // Stack supplying the buffers to tokenize.
  Buffer_Stack* stack;

  Tokenizer(Buffer_Stack* stack) : stack(stack) {}
};
// Builds a Span from the current buffer's file and the given start/end
// offsets. The tokenizer must be non-null and must have a current buffer.
static inline Span tokenizer_make_span(const Tokenizer* tokenizer, usize start,
                                       usize end) {
  assert_neq(tokenizer, nullptr);
  return Span(tokenizer->buffer->file, start, end);
}
// Builds a String that starts at offset `start` of the current buffer's
// content and is `end - start` long. The tokenizer must be non-null and must
// have a current buffer.
static inline String tokenizer_make_lexeme(const Tokenizer* tokenizer,
                                           usize start, usize end) {
  assert_neq(tokenizer, nullptr);
  const usize length = end - start;
  return String(tokenizer->buffer->content[start], length);
}
// Assembles a Token of the given kind whose lexeme and span are both derived
// from the `start`/`end` offsets in the tokenizer's current buffer.
static inline Token tokenizer_make_token(const Tokenizer* tokenizer,
                                         Token_Kind kind, usize start,
                                         usize end) {
  assert_neq(tokenizer, nullptr);
  const String text = tokenizer_make_lexeme(tokenizer, start, end);
  const Span location = tokenizer_make_span(tokenizer, start, end);
  return Token(kind, text, location);
}
// Makes sure `tokenizer->buffer` has unread content, popping buffers off the
// stack as needed, and stores that buffer's cursor in `*cursor`.
//
// Returns true when a buffer with unread content is current. Returns false
// when the current buffer is absent or exhausted and the stack has no buffer
// with unread content left; `*cursor` may still have been written in that
// case and must not be used.
static bool tokenizer_get_buffer(Tokenizer* tokenizer, usize* cursor) {
  assert_neq(tokenizer, nullptr);
  assert_neq(cursor, nullptr);

  Buffer* curr = tokenizer->buffer;
  if (likely(curr != nullptr)) {
    *cursor = curr->cursor;
    if (*cursor < curr->content.length) return true;
  }

  // A popped buffer may itself be empty or already consumed; keep popping so
  // callers never receive a cursor that sits at the end of the content.
  while (buffer_stack_pop(tokenizer->stack, &curr)) {
    tokenizer->buffer = curr;
    *cursor = curr->cursor;
    if (*cursor < curr->content.length) return true;
  }
  return false;
}
[[nodiscard]] static bool tokenizer_char(const Tokenizer* tokenizer,
usize offset, u8* out_nobytes,
wchar* out_char) {
const String text = tokenizer->buffer->content;
assert_ste(offset, text.length);
if (offset == text.length) return false;
unsigned char c = *text[offset];
u8 nobytes = utf8_nobytes(c);
if (nobytes > 1) panic("no support for multi-byte chars: %c:%d", c, nobytes);
*out_nobytes = nobytes;
*out_char = (wchar)c;
return true;
}
// Moves `*offset` forward by `nbytes`. `offset` must be non-null.
static void tokenizer_advance(usize* offset, usize nbytes) {
  assert_neq(offset, nullptr);
  *offset = *offset + nbytes;
}
// Extends `*offset` past every following character that is alphanumeric (per
// utf8_is_alnum) or '_', stopping at the first other character or at the end
// of the buffer, then returns an identifier token spanning `start`..`*offset`.
//
// `*offset` is left just past the identifier's last character.
static Token tokenizer_lex_identifier(Tokenizer* tokenizer, usize start,
                                      usize* offset) {
  assert_neq(tokenizer, nullptr);

  wchar ch;
  u8 width;
  for (;;) {
    // End of the current buffer also ends the identifier.
    if (!tokenizer_char(tokenizer, *offset, &width, &ch)) break;

    const bool is_ident_char = utf8_is_alnum(ch) || ch == '_';
    if (!is_ident_char) break;

    tokenizer_advance(offset, width);
  }

  return tokenizer_make_token(tokenizer, Token_Kind_Identifier, start, *offset);
}
// Produces the next token from the tokenizer's buffers.
//
// Returns false when no buffer with unread input remains; `*out` is not
// written in that case. Otherwise writes the token to `*out`, moves the
// current buffer's cursor past it, and returns true.
//
// Characters accepted by utf8_is_identifier start an identifier token; every
// other character currently yields a single-character Token_Kind_Invalid_Char.
bool tokenizer_next(Tokenizer* tokenizer, Token* out) {
  assert_neq(tokenizer, nullptr);
  assert_neq(out, nullptr);

  usize cursor;
  wchar c;
  u8 nobytes;

  // tokenizer_char fails when the cursor sits at the end of the buffer, which
  // can happen for a freshly popped empty buffer. Asking for a buffer again
  // then moves on to the next one, so `c` and `nobytes` are never read
  // uninitialized.
  do {
    if (unlikely(!tokenizer_get_buffer(tokenizer, &cursor))) return false;
  } while (unlikely(!tokenizer_char(tokenizer, cursor, &nobytes, &c)));

  // `advance` tracks the end of the token; the first character is consumed
  // up front for both branches below.
  usize advance = cursor;
  tokenizer_advance(&advance, nobytes);

  if (utf8_is_identifier(c)) {
    *out = tokenizer_lex_identifier(tokenizer, cursor, &advance);
  } else {
    switch (c) {
      default:
        *out = tokenizer_make_token(tokenizer, Token_Kind_Invalid_Char, cursor,
                                    advance);
    }
  }

  tokenizer->buffer->cursor = advance;
  return true;
}
#endif
|