summary | refs | log | tree | commit | diff
path: root/src/tokenizer.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/tokenizer.cc')
-rw-r--r-- src/tokenizer.cc 58
1 files changed, 58 insertions, 0 deletions
diff --git a/src/tokenizer.cc b/src/tokenizer.cc
new file mode 100644
index 0000000..54634de
--- /dev/null
+++ b/src/tokenizer.cc
@@ -0,0 +1,58 @@
+#ifndef TOKENIZER_CC
+#define TOKENIZER_CC
+
+#include "common.cc"
+#include "source.cc"
+#include "utf8.cc"
+#include "token.cc"
+
+// Tokenizer state: the buffer currently in use plus the Buffer_Stack that
+// further buffers are taken from.
+struct Tokenizer {
+    // Buffer currently being read; starts out null.
+    const Buffer* buffer = nullptr;
+    // Stack handed in at construction time; not owned or checked here.
+    Buffer_Stack* stack;
+
+    // Stores `stack` and leaves `buffer` at its null default.
+    Tokenizer(Buffer_Stack* stack) : stack(stack) {}
+};
+
+// Returns the tokenizer's current buffer, popping one from its stack first
+// when no buffer is currently set.
+//
+// tokenizer: must be non-null (asserted).
+//
+// Returns nullptr when buffer_stack_pop() reports failure; in that case
+// tokenizer->buffer is left unchanged (still null).
+static inline const Buffer* tokenizer_get_buffer(Tokenizer* tokenizer) {
+    assert_neq(tokenizer, nullptr);
+
+    if(tokenizer->buffer == nullptr) {
+        Buffer* popped = nullptr;
+        if(!buffer_stack_pop(tokenizer->stack, &popped)) return nullptr;
+
+        // Remember the popped buffer so later calls reuse it.
+        tokenizer->buffer = popped;
+    }
+
+    return tokenizer->buffer;
+}
+
+// Reads the single-byte character at `*offset` in the tokenizer's current
+// buffer content and moves `*offset` past it.
+//
+// tokenizer: must be non-null (asserted); tokenizer->buffer is dereferenced
+//            without a check, so a current buffer must already be set.
+// offset:    in/out position into the buffer content; must be non-null
+//            (asserted). It is increased by the value utf8_nobytes() returns.
+//            No bounds check against the content length is done here.
+//
+// Returns the character narrowed to `char`. Calls panic() when utf8_nobytes()
+// reports more than one byte.
+static inline char tokenizer_advance(const Tokenizer* tokenizer, usize* offset) {
+    assert_neq(tokenizer, nullptr);
+    assert_neq(offset, nullptr);
+
+    const String text = tokenizer->buffer->content;
+
+    // NOTE(review): the result of text[*offset] is stored in a pointer; confirm
+    // that indexing a String yields a byte pointer rather than a byte value.
+    const unsigned char* c = text[*offset];
+    wchar wc = (wchar)*c;
+
+    u8 nobytes = utf8_nobytes(wc);
+    if(nobytes > 1) panic("no support for multi-byte chars: %d:%d", wc, nobytes);
+
+    // Write through the pointer so the caller sees the new position;
+    // incrementing `offset` itself would only move the local pointer.
+    *offset += nobytes;
+    return (char)wc;
+}
+
+// Entry point for fetching the next token into `out`.
+//
+// tokenizer: must be non-null (asserted).
+// out:       must be non-null (asserted); it is not written by the current
+//            implementation.
+//
+// Buffers whose cursor already equals their content length are dropped and
+// the next buffer is requested. The function currently returns false in every
+// case: both when no buffer can be obtained and when a buffer with unread
+// content is found.
+bool tokenizer_next(Tokenizer* tokenizer, Token* out) {
+    assert_neq(tokenizer, nullptr);
+    assert_neq(out, nullptr);
+
+    for(;;) {
+        const Buffer* current = tokenizer_get_buffer(tokenizer);
+        if(current == nullptr) return false;
+
+        const usize position = current->cursor;
+        if(position != current->content.length) break;
+
+        // This buffer is exhausted: forget it so the next iteration asks
+        // tokenizer_get_buffer() for a fresh one.
+        tokenizer->buffer = nullptr;
+    }
+
+    return false;
+}
+
+#endif