git.corax.cc Git - ccc/commitdiff
Add first attempt at writing a tokenizer
author Matthias Kruk <m@m10k.eu>
Wed, 27 May 2020 00:34:05 +0000 (09:34 +0900)
committer Matthias Kruk <m@m10k.eu>
Wed, 27 May 2020 00:34:05 +0000 (09:34 +0900)
src/Makefile [new file with mode: 0644]
src/list.c [new file with mode: 0644]
src/list.h [new file with mode: 0644]
src/str.c [new file with mode: 0644]
src/str.h [new file with mode: 0644]
src/str_test.c [new file with mode: 0644]
src/token.c [new file with mode: 0644]
src/token.h [new file with mode: 0644]
src/tokenize.c [new file with mode: 0644]

diff --git a/src/Makefile b/src/Makefile
new file mode 100644 (file)
index 0000000..51280eb
--- /dev/null
@@ -0,0 +1,13 @@
+# Build the tokenizer from its object files.
+#
+# The warning/standard flags live in CFLAGS so that make's implicit
+# %.o: %.c rule applies them when compiling each source file; passing them
+# only on the link line has no effect on how the objects are compiled.
+OBJECTS = str.o token.o list.o tokenize.o
+OUTPUT = tokenize
+PHONY = all clean
+CFLAGS = -std=c99 -Wall -pedantic
+
+all: $(OUTPUT)
+
+$(OUTPUT): $(OBJECTS)
+       $(CC) $(CFLAGS) -o $@ $^
+
+clean:
+       rm -rf $(OUTPUT) $(OBJECTS)
+
+.PHONY: $(PHONY)
diff --git a/src/list.c b/src/list.c
new file mode 100644 (file)
index 0000000..e29af88
--- /dev/null
@@ -0,0 +1,34 @@
+#include <stdlib.h>
+#include <errno.h>
+#include "list.h"
+
+/*
+ * Allocate a single list node that carries `data` and has no successor.
+ * Returns the new node, or NULL if malloc() fails.
+ */
+list_t *list_new(void *data)
+{
+       list_t *node = malloc(sizeof(*node));
+
+       if(!node) {
+               return(NULL);
+       }
+
+       node->data = data;
+       node->next = NULL;
+
+       return(node);
+}
+
+/*
+ * Release the node `list` itself. Neither the nodes reachable through
+ * `next` nor the `data` pointer are freed here.
+ */
+void list_free(list_t *list)
+{
+       free(list);
+}
+
+/*
+ * Append a new node holding `data` to the end of the list rooted at *list.
+ * An empty list (*list == NULL) receives the new node as its head.
+ * Returns 0 on success or -ENOMEM if the node could not be allocated.
+ */
+int list_append(list_t **list, void *data)
+{
+       list_t **slot;
+       list_t *node;
+
+       /* advance to the NULL link that terminates the list */
+       for(slot = list; *slot; slot = &((*slot)->next)) {
+               ;
+       }
+
+       node = list_new(data);
+       *slot = node;
+
+       if(!node) {
+               return(-ENOMEM);
+       }
+
+       return(0);
+}
diff --git a/src/list.h b/src/list.h
new file mode 100644 (file)
index 0000000..86858a0
--- /dev/null
@@ -0,0 +1,16 @@
+#ifndef LIST_H
+#define LIST_H
+
+/* Node of a singly linked list carrying an untyped payload. */
+typedef struct list list_t;
+
+struct list {
+       list_t *next; /* following node; NULL marks the end of the list */
+       void *data;   /* payload pointer handed to list_new() */
+};
+
+/* Create a detached node holding `data`; NULL on allocation failure. */
+list_t *list_new(void *data);
+
+/* Free the node `list` itself. */
+void list_free(list_t *list);
+
+/* Append a node holding `data` to *list; 0 on success, -ENOMEM otherwise. */
+int list_append(list_t **list, void *data);
+
+#endif /* LIST_H */
diff --git a/src/str.c b/src/str.c
new file mode 100644 (file)
index 0000000..c836a17
--- /dev/null
+++ b/src/str.c
@@ -0,0 +1,120 @@
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <errno.h>
+#include "str.h"
+
+struct str {
+       char *data; /* heap buffer of size + 1 bytes (one extra byte for the terminator) */
+       size_t size; /* capacity in characters, not counting the extra terminator byte */
+       size_t len; /* number of characters currently stored */
+};
+
+#define STR_INITLEN 1024 /* initial capacity; also the increment used when the buffer grows */
+
+/*
+ * Create an empty string with room for STR_INITLEN characters.
+ * The whole buffer (including the extra terminator byte) is zero-filled.
+ * Returns NULL if either allocation fails.
+ */
+str_t *str_new(void)
+{
+       str_t *s;
+       char *buf;
+
+       s = malloc(sizeof(*s));
+
+       if(!s) {
+               return(NULL);
+       }
+
+       buf = malloc(STR_INITLEN + 1);
+
+       if(!buf) {
+               free(s);
+               return(NULL);
+       }
+
+       memset(buf, 0, STR_INITLEN + 1);
+
+       s->data = buf;
+       s->size = STR_INITLEN;
+       s->len = 0;
+
+       return(s);
+}
+
+/*
+ * Grow the buffer of `str` by STR_INITLEN characters.
+ *
+ * The existing size + 1 bytes are kept byte-for-byte and the newly added
+ * tail is zero-filled. Copying with snprintf("%s") would stop at the first
+ * NUL byte, silently dropping any characters stored after an embedded NUL
+ * even though `len` still counts them.
+ *
+ * Returns 0 on success. Returns -ENOMEM if the new size would overflow or
+ * the allocation fails; in both cases `str` is left unchanged.
+ */
+int _str_expand(str_t *str)
+{
+       size_t nsize;
+       char *ndata;
+
+       nsize = str->size + STR_INITLEN;
+
+       /* reject wrap-around of the capacity (and of the + 1 below) */
+       if(nsize < str->size || nsize + 1 < nsize) {
+               return(-ENOMEM);
+       }
+
+       /* keep the old pointer valid if realloc() fails */
+       ndata = realloc(str->data, nsize + 1);
+
+       if(!ndata) {
+               return(-ENOMEM);
+       }
+
+       /* bytes [0, size] are carried over; clear everything after them */
+       memset(ndata + str->size + 1, 0, nsize - str->size);
+
+       str->data = ndata;
+       str->size = nsize;
+
+       return(0);
+}
+
+/*
+ * Append the character `c` to `str`, growing the buffer when it is full.
+ *
+ * The terminator is rewritten after every append. str_set() only writes
+ * the new value and its terminator, so bytes of a previous, longer value
+ * can still sit behind it; without re-terminating, an append would
+ * overwrite the terminator and expose those stale bytes.
+ *
+ * Returns 0 on success or the negative error from _str_expand().
+ */
+int str_appendc(str_t *str, const char c)
+{
+       int ret_val = 0;
+
+       if(str->len == str->size) {
+               ret_val = _str_expand(str);
+       }
+
+       if(!ret_val && str->len < str->size) {
+               str->data[str->len++] = c;
+               /* len <= size here and the buffer holds size + 1 bytes */
+               str->data[str->len] = '\0';
+       }
+
+       return(ret_val);
+}
+
+/*
+ * Append the NUL-terminated string `s` to `str` one character at a time.
+ * Stops early if str_appendc() reports an error.
+ * Returns the number of characters that were appended.
+ */
+int str_appends(str_t *str, const char *s)
+{
+       int count = 0;
+
+       while(s[count] && str_appendc(str, s[count]) >= 0) {
+               count++;
+       }
+
+       return(count);
+}
+
+/*
+ * Return a read-only pointer to the internal buffer of `str`.
+ * The buffer is not copied.
+ */
+const char* str_value(str_t *str)
+{
+       const char *value = str->data;
+
+       return(value);
+}
+
+/*
+ * Replace the contents of `str` with a copy of the string `val`.
+ *
+ * The buffer holds size + 1 bytes, so a value of up to `size` characters
+ * fits together with its terminator. Formatting through snprintf() with a
+ * bound of `size` would cut the last character of a value that is exactly
+ * `size` characters long while still reporting the full length in `len`.
+ * The bytes behind the new value are cleared so nothing of a previous,
+ * longer value remains in the buffer.
+ *
+ * Returns 0 on success or the negative error from _str_expand(); on
+ * failure the previous contents are left in place.
+ */
+int str_set(str_t *str, const char *val)
+{
+       size_t nlen;
+       int ret_val;
+
+       nlen = strlen(val);
+
+       while(str->size < nlen) {
+               ret_val = _str_expand(str);
+
+               if(ret_val < 0) {
+                       return(ret_val);
+               }
+       }
+
+       /* memmove: tolerate `val` pointing into str's own buffer */
+       memmove(str->data, val, nlen);
+       memset(str->data + nlen, 0, str->size + 1 - nlen);
+       str->len = nlen;
+
+       return(0);
+}
diff --git a/src/str.h b/src/str.h
new file mode 100644 (file)
index 0000000..9d7b64b
--- /dev/null
+++ b/src/str.h
@@ -0,0 +1,13 @@
+#ifndef STR_H
+#define STR_H
+
+/* Opaque growable character buffer; the layout is private to str.c. */
+typedef struct str str_t;
+
+/* Allocate an empty string; NULL when out of memory. */
+str_t *str_new(void);
+
+/* Append one character; 0 on success, negative error code on failure. */
+int str_appendc(str_t *str, const char c);
+
+/* Append a NUL-terminated string; returns the number of characters appended. */
+int str_appends(str_t *str, const char *s);
+
+/* NOTE(review): declared here, but str.c as shown in this patch does not define it. */
+void str_free(str_t *str);
+
+/* Read-only view of the internal buffer (not a copy). */
+const char* str_value(str_t *str);
+
+/* Replace the contents with `val`; 0 on success, negative error code on failure. */
+int str_set(str_t *str, const char *val);
+
+#endif /* STR_H */
diff --git a/src/str_test.c b/src/str_test.c
new file mode 100644 (file)
index 0000000..72eb6e5
--- /dev/null
@@ -0,0 +1,15 @@
+#include <stdio.h>
+#include "str.h"
+
+/*
+ * Smoke test for the str module: build "Hello, world!" from a string and a
+ * single character and print it.
+ *
+ * Uses str_new(), the allocator that str.h declares; there is no
+ * str_alloc() in the str API. Returns 1 if the string cannot be allocated.
+ */
+int main(int argc, char *argv[])
+{
+       str_t *str;
+
+       str = str_new();
+
+       if(!str) {
+               fprintf(stderr, "str_new failed\n");
+               return(1);
+       }
+
+       str_appends(str, "Hello, world");
+       str_appendc(str, '!');
+
+       printf("%s\n", str_value(str));
+       return(0);
+}
diff --git a/src/token.c b/src/token.c
new file mode 100644 (file)
index 0000000..cafc7ac
--- /dev/null
@@ -0,0 +1,68 @@
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <errno.h>
+#include "token.h"
+#include "str.h"
+
+/*
+ * Allocate a token with every field zeroed (position 0:0, no value).
+ * Returns NULL if the allocation fails.
+ */
+struct token *token_new(void)
+{
+       struct token *tok = malloc(sizeof(*tok));
+
+       if(!tok) {
+               return(NULL);
+       }
+
+       memset(tok, 0, sizeof(*tok));
+
+       return(tok);
+}
+
+/*
+ * Allocate a token positioned at line:col whose value is a copy of `val`.
+ *
+ * token_setvalue() is called outside of assert(): an expression inside
+ * assert() is not evaluated when NDEBUG is defined, which would leave the
+ * token without a value in such builds.
+ *
+ * Returns the token, or NULL if the token or its value cannot be set up.
+ */
+struct token *token_new2(const int line, const int col, const char *val)
+{
+       struct token *tok;
+
+       tok = token_new();
+
+       if(!tok) {
+               return(NULL);
+       }
+
+       token_setpos(tok, line, col);
+
+       if(token_setvalue(tok, val) != 0) {
+               token_free(tok);
+               tok = NULL;
+       }
+
+       return(tok);
+}
+
+/*
+ * Release the token structure `tok`.
+ * Only the struct itself is freed; the str_t in tok->value is not
+ * released here.
+ */
+void token_free(struct token *tok)
+{
+       free(tok);
+}
+
+/* Record the source position (line and column) on `tok`. */
+void token_setpos(struct token *tok, int line, int col)
+{
+       tok->column = col;
+       tok->line = line;
+}
+
+/*
+ * Set the value of `tok` to a copy of `str`, creating the underlying
+ * str_t on first use.
+ * Returns the result of str_set(), or -ENOMEM if the str_t cannot be
+ * allocated.
+ */
+int token_setvalue(struct token *tok, const char *str)
+{
+       str_t *value = tok->value;
+
+       if(!value) {
+               value = str_new();
+               tok->value = value;
+       }
+
+       if(!value) {
+               return(-ENOMEM);
+       }
+
+       return(str_set(value, str));
+}
+
+/*
+ * Return the text of `tok` as a read-only C string.
+ * A token that has never been given a value (token_new() leaves `value`
+ * as NULL) yields NULL instead of dereferencing the missing str_t.
+ */
+const char *token_getvalue(struct token *tok)
+{
+       if(!tok->value) {
+               return(NULL);
+       }
+
+       return(str_value(tok->value));
+}
diff --git a/src/token.h b/src/token.h
new file mode 100644 (file)
index 0000000..a6cc972
--- /dev/null
@@ -0,0 +1,19 @@
+#ifndef TOKEN_H
+#define TOKEN_H
+
+#include <stddef.h> /* size_t, so this header can be included on its own */
+#include "str.h"
+
+/* A lexical token together with the source position recorded for it. */
+struct token {
+       int line;     /* line set through token_setpos() */
+       int column;   /* column set through token_setpos() */
+       str_t *value; /* token text; NULL until token_setvalue() succeeds */
+       size_t len;   /* zeroed by token_new(); not assigned elsewhere in token.c */
+};
+
+/* Allocate a token at line:col holding a copy of `val`. */
+struct token *token_new2(const int line, const int col, const char *val);
+
+/* Allocate a zero-initialised token; NULL on allocation failure. */
+struct token *token_new(void);
+
+/* Free the token structure. */
+void token_free(struct token *tok);
+
+/* Store the source position on the token. */
+void token_setpos(struct token *tok, const int line, const int col);
+
+/* Set the token text; 0 on success, negative error code on failure. */
+int token_setvalue(struct token *tok, const char *str);
+
+/* Read-only access to the token text (defined in token.c). */
+const char *token_getvalue(struct token *tok);
+
+#endif /* TOKEN_H */
diff --git a/src/tokenize.c b/src/tokenize.c
new file mode 100644 (file)
index 0000000..73fe060
--- /dev/null
@@ -0,0 +1,350 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <assert.h>
+#include "str.h"
+#include "list.h"
+#include "token.h"
+
+#define TABWIDTH      8 /* columns added to _col when getnext() reads a tab */
+#define STATE_NONE    0 /* not inside any token */
+#define STATE_COMMENT 1 /* after the opening of a comment */
+#define STATE_OP      2 /* only referenced by code inside #if 0 in this file */
+#define STATE_DIV     3 /* a '/' has been read, next character decides the token */
+
+#define STATE_DONE    8 /* only referenced by code inside #if 0 in this file */
+
+static char _next = 0; /* one-character pushback slot; 0 means empty */
+static int _state = STATE_NONE; /* only referenced by code inside #if 0 in this file */
+static int _line = 1; /* current input line, advanced by getnext() */
+static int _col = 1; /* current input column, advanced by getnext() */
+static int _pline = 1; /* line saved by getnext(), restored by putnext() */
+static int _pcol = 1; /* column saved by getnext(), restored by putnext() */
+list_t *tokens = NULL; /* token list; only appended to by code inside #if 0 */
+
+/*
+ * Return the next input character, or 0 at end of input.
+ *
+ * A character handed back with putnext() is returned first; otherwise the
+ * character comes from getchar(). The position before the character is
+ * saved in _pline/_pcol for every character, so that putnext() can rewind
+ * to it. Saving the line only when a newline is read would make putnext()
+ * jump back to the line of the last newline instead of the current one.
+ *
+ * _line/_col are then advanced: a newline starts the next line at column
+ * 1, a tab adds TABWIDTH columns, any other character adds one column and
+ * end of input leaves the position unchanged.
+ */
+int getnext(void)
+{
+       int ret_val;
+
+       if(_next) {
+               /* same value range as getchar(), also for bytes >= 0x80 */
+               ret_val = (unsigned char)_next;
+               _next = 0;
+       } else {
+               ret_val = getchar();
+
+               if(ret_val == EOF) {
+                       ret_val = 0;
+               }
+       }
+
+       /* remember where this character started */
+       _pline = _line;
+       _pcol = _col;
+
+       switch(ret_val) {
+       case 0:
+               break;
+
+       case '\n':
+               _line++;
+               _col = 1;
+               break;
+
+       case '\t':
+               _col += TABWIDTH;
+               break;
+
+       default:
+               _col++;
+               break;
+       }
+
+       return(ret_val);
+}
+
+/*
+ * Hand the character `c` back so that the next getnext() returns it, and
+ * rewind the current position to the one saved in _pline/_pcol.
+ * Only one character can be pending: returns -EALREADY if the pushback
+ * slot is occupied, 0 otherwise.
+ */
+int putnext(const char c)
+{
+       if(_next) {
+               return(-EALREADY);
+       }
+
+       _next = c;
+       _line = _pline;
+       _col = _pcol;
+
+       return(0);
+}
+
+#if 0
+int comment(void) /* disabled (#if 0): reads a comment body and appends it to `tokens` */
+{
+       struct token *tok;
+       int cstate;
+       char c;
+
+#define CSTATE_HEAD 0 /* expecting the '*' that follows the leading '/' */
+#define CSTATE_BODY 1 /* inside the comment text */
+#define CSTATE_FOOT 2 /* a '*' was just read inside the comment */
+#define CSTATE_DONE 3 /* closing '/' seen */
+
+       cstate = CSTATE_HEAD;
+
+       tok = token_new(TOKEN_COMMENT); /* NOTE(review): token.h declares token_new(void); TOKEN_COMMENT is not defined in the visible sources */
+       assert(tok);
+
+       token_setpos(tok, _line, _col);
+
+       str_appendc(tok->value, '/');
+
+       while(cstate != CSTATE_DONE) {
+               c = getnext();
+
+               str_appendc(tok->value, c);
+
+               switch(cstate) {
+               case CSTATE_HEAD:
+                       assert(c == '*');
+                       cstate = CSTATE_BODY;
+                       break;
+
+               case CSTATE_BODY:
+                       if(c == '*') {
+                               cstate = CSTATE_FOOT;
+                       }
+                       break;
+
+               case CSTATE_FOOT:
+                       if(c == '/') {
+                               cstate = CSTATE_DONE;
+                       } else {
+                               cstate = CSTATE_BODY; /* NOTE(review): a second '*' also lands here, so a closing run like "**" followed by '/' is not recognised */
+                       }
+
+                       break;
+
+               default:
+                       assert(0);
+                       break;
+               }
+       }
+
+       list_append(&tokens, tok);
+
+       return(0);
+}
+
+int div(void)
+{
+       char c;
+
+       c = getnext();
+
+int none(void)
+{
+       char c;
+       int ret_val;
+       char lookahead;
+
+       c = getnext();
+
+       if(!c) {
+               return(STATE_DONE);
+       }
+
+       switch(c) {
+       case ' ':
+       case '\t':
+       case '\n':
+               ret_val = STATE_NONE;
+               break;
+
+       case '/':
+               lookahead = getnext();
+
+               if(lookahead == '*') {
+                       ret_val = STATE_COMMENT;
+               } else {
+                       ret_val = STATE_DIV;
+               }
+
+               putnext(lookahead);
+
+               break;
+
+       default:
+               ret_val = STATE_NONE;
+               break;
+/*
+       case '"':
+       case '\'':
+       case '0':
+       case '1':
+       case '2':
+       case '3':
+       case '4':
+       case '5':
+       case '6':
+       case '7':
+       case '8':
+       case '9':
+               ret_val = STATE_LITERAL;
+               break;
+               */
+/*
+       case '+':
+       case '-':
+       case '*':
+       case '=':
+               ret_val = STATE_OP;
+               break;
+*/
+/*
+       case '<':
+       case '>':
+       case '?':
+       case ',':
+       case ';':
+       case ':':
+       case '!':
+       case '%':
+       case '&':
+       case '(':
+       case ')':
+       case '~':
+       case '^':
+       case '|':
+       case '[':
+       case ']':
+       case '{':
+       case '}':
+*/
+       }
+
+       return(ret_val);
+}
+#endif /* 0 */
+
+/*
+ * Read characters until one token is complete and return it.
+ *
+ * Recognised tokens: "/", "/=" and comments, which run from the opening
+ * slash-star up to and including the first star-slash that follows it.
+ * Whitespace between tokens is skipped.
+ *
+ * Every token carries the position of its first character: the position
+ * is sampled before the character is read, because getnext() has already
+ * moved _line/_col past the character by the time it returns.
+ *
+ * Returns NULL at end of input, on an unrecognised character, on a comment
+ * that is still open at end of input, or if a token cannot be allocated.
+ */
+struct token *nexttoken(void)
+{
+       struct token *tok;
+       int state;
+       int start_line;
+       int start_col;
+       int prev_star;
+
+       state = STATE_NONE;
+       tok = NULL;
+       start_line = _line;
+       start_col = _col;
+       prev_star = 0;
+
+       while(1) {
+               int line = _line;
+               int col = _col;
+               int c = getnext();
+
+               switch(state) {
+               case STATE_NONE:
+                       switch(c) {
+                       case 0:
+                               /* end of input is not an error */
+                               return(NULL);
+
+                       case '/':
+                               start_line = line;
+                               start_col = col;
+                               state = STATE_DIV;
+                               break;
+
+                       case '\r':
+                               fprintf(stderr, "DOS user, eh?\n");
+                               /* fallthrough */
+                       case ' ':
+                       case '\t':
+                       case '\n':
+                               break;
+
+                       default:
+                               /* unrecognized token */
+                               fprintf(stderr, "Unrecognized token at %d:%d\n", line, col);
+                               return(NULL);
+                       }
+
+                       break;
+
+               case STATE_DIV:
+                       switch(c) {
+                       case '*':
+                               state = STATE_COMMENT;
+                               break;
+
+                       case '=':
+                               return(token_new2(start_line, start_col, "/="));
+
+                       default:
+                               /* not part of this token; nothing to hand back at end of input */
+                               if(c) {
+                                       putnext(c);
+                               }
+
+                               return(token_new2(start_line, start_col, "/"));
+                       }
+
+                       break;
+
+               case STATE_COMMENT:
+                       if(!c) {
+                               /* input ended inside the comment: stop instead of looping forever */
+                               fprintf(stderr, "Unterminated comment starting at %d:%d\n",
+                                       start_line, start_col);
+                               token_free(tok);
+                               return(NULL);
+                       }
+
+                       if(!tok) {
+                               tok = token_new2(start_line, start_col, "/*");
+
+                               if(!tok) {
+                                       return(NULL);
+                               }
+                       }
+
+                       if(str_appendc(tok->value, c) < 0) {
+                               token_free(tok);
+                               return(NULL);
+                       }
+
+                       /* the '*' of the opener is never counted, so "/*" followed by "/" stays open */
+                       if(prev_star && c == '/') {
+                               return(tok);
+                       }
+
+                       prev_star = (c == '*');
+                       break;
+
+               default:
+                       return(NULL);
+               }
+       }
+
+       return(NULL);
+}
+
+
+
+/*
+ * Tokenize standard input and print one line per token until nexttoken()
+ * returns NULL. Each token is released with token_free() once it has been
+ * printed, so tokens do not pile up for the lifetime of the process.
+ *
+ * The section inside #if 0 is the earlier state-machine driver; it is kept
+ * as is and is not compiled.
+ */
+int main(int argc, char *argv[])
+{
+       int ret_val;
+       struct token *tok;
+
+       ret_val = 0;
+
+       while((tok = nexttoken())) {
+               printf("Token at %d:%d: %s\n", tok->line, tok->column, str_value(tok->value));
+               token_free(tok);
+       }
+#if 0
+       while(_state != STATE_DONE) {
+               printf("_state = %d\n", _state);
+
+               switch(_state) {
+               case STATE_NONE:
+                       _state = none();
+                       break;
+
+               case STATE_COMMENT:
+                       _state = comment();
+                       break;
+
+               case STATE_OP:
+                       _state = op();
+                       break;
+
+               default:
+                       _state = STATE_DONE;
+                       break;
+               }
+       }
+
+       if(!tokens) {
+               printf("No tokens\n");
+       }
+
+       for(litem = tokens; litem; litem = litem->next) {
+               struct token *t;
+
+               t = (struct token*)litem->data;
+
+               printf("%s token at line %d:%d\n", token_typestr(t->type), t->line, t->column);
+       }
+#endif
+
+       return(ret_val);
+}