From: Matthias Kruk
Date: Wed, 27 May 2020 00:34:05 +0000 (+0900)
Subject: Add first attempt at writing a tokenizer
X-Git-Url: https://git.corax.cc/?a=commitdiff_plain;h=a4dd66f99d9ed30d713d3c7c9b1fac3dbc53777e;p=ccc

Add first attempt at writing a tokenizer
---

diff --git a/src/Makefile b/src/Makefile
new file mode 100644
index 0000000..51280eb
--- /dev/null
+++ b/src/Makefile
@@ -0,0 +1,16 @@
+# CFLAGS is also picked up by make's implicit .c -> .o rule, so the
+# warning flags apply to every object and not only to the link step.
+CFLAGS = -std=c99 -Wall -pedantic
+OBJECTS = str.o token.o list.o tokenize.o
+OUTPUT = tokenize
+PHONY = all clean
+
+all: $(OUTPUT)
+
+$(OUTPUT): $(OBJECTS)
+	$(CC) $(CFLAGS) -o $@ $^
+
+clean:
+	rm -rf $(OUTPUT) $(OBJECTS)
+
+.PHONY: $(PHONY)
diff --git a/src/list.c b/src/list.c
new file mode 100644
index 0000000..e29af88
--- /dev/null
+++ b/src/list.c
@@ -0,0 +1,47 @@
+#include <stdlib.h>
+#include <errno.h>
+#include "list.h"
+
+/*
+ * Allocate a list node that refers to data. The node does not take
+ * ownership of data. Returns NULL if the allocation fails.
+ */
+list_t *list_new(void *data)
+{
+    list_t *l;
+
+    l = malloc(sizeof(*l));
+
+    if(l) {
+        l->next = NULL;
+        l->data = data;
+    }
+
+    return(l);
+}
+
+/*
+ * Free a single node. Neither the data it refers to nor any node
+ * linked behind it is released.
+ */
+void list_free(list_t *list)
+{
+    free(list);
+    return;
+}
+
+/*
+ * Append a new node referring to data at the tail of *list. An empty
+ * list is represented by *list == NULL.
+ * Returns 0 on success or -ENOMEM if the node cannot be allocated.
+ */
+int list_append(list_t **list, void *data)
+{
+    while(*list) {
+        list = &((*list)->next);
+    }
+
+    *list = list_new(data);
+
+    return(*list ? 0 : -ENOMEM);
+}
diff --git a/src/list.h b/src/list.h
new file mode 100644
index 0000000..86858a0
--- /dev/null
+++ b/src/list.h
@@ -0,0 +1,17 @@
+#ifndef LIST_H
+#define LIST_H
+
+typedef struct list list_t;
+
+/* Singly linked list node; data is borrowed, never freed by the list. */
+struct list {
+    struct list *next;
+    void *data;
+};
+
+list_t *list_new(void *data);
+void list_free(list_t *list);
+
+int list_append(list_t **list, void *data);
+
+#endif /* LIST_H */
diff --git a/src/str.c b/src/str.c
new file mode 100644
index 0000000..c836a17
--- /dev/null
+++ b/src/str.c
@@ -0,0 +1,151 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include "str.h"
+
+/*
+ * Growable string. data has room for size characters plus the
+ * terminating NUL; len is the number of characters stored.
+ */
+struct str {
+    char *data;
+    size_t size;
+    size_t len;
+};
+
+#define STR_INITLEN 1024
+
+/*
+ * Allocate an empty string with room for STR_INITLEN characters.
+ * Returns NULL if either allocation fails.
+ */
+str_t *str_new(void)
+{
+    str_t *s;
+
+    s = malloc(sizeof(*s));
+
+    if(s) {
+        s->size = STR_INITLEN;
+        s->data = calloc(s->size + 1, 1);
+        s->len = 0;
+
+        if(!s->data) {
+            free(s);
+            s = NULL;
+        }
+    }
+
+    return(s);
+}
+
+/* Release a string and its buffer. Passing NULL is a no-op. */
+void str_free(str_t *str)
+{
+    if(str) {
+        free(str->data);
+        free(str);
+    }
+
+    return;
+}
+
+/*
+ * Grow the buffer by STR_INITLEN characters, keeping the contents.
+ * Returns 0 on success or -ENOMEM, in which case str is unchanged.
+ */
+static int str_expand(str_t *str)
+{
+    char *ndata;
+    size_t nsize;
+
+    nsize = str->size + STR_INITLEN;
+    ndata = calloc(nsize + 1, 1);
+
+    if(!ndata) {
+        return(-ENOMEM);
+    }
+
+    memcpy(ndata, str->data, str->len);
+
+    free(str->data);
+    str->data = ndata;
+    str->size = nsize;
+
+    return(0);
+}
+
+/*
+ * Append one character. The terminating NUL is rewritten on every
+ * append, so the value stays terminated even after str_set() has
+ * replaced a longer value with a shorter one.
+ * Returns 0 on success or -ENOMEM if the buffer cannot grow.
+ */
+int str_appendc(str_t *str, const char c)
+{
+    int ret_val;
+
+    ret_val = 0;
+
+    if(str->len == str->size) {
+        ret_val = str_expand(str);
+    }
+
+    if(!ret_val) {
+        str->data[str->len++] = c;
+        str->data[str->len] = '\0';
+    }
+
+    return(ret_val);
+}
+
+/*
+ * Append the NUL-terminated string s.
+ * Returns the number of characters appended; this is less than
+ * strlen(s) if the buffer could not be grown.
+ */
+int str_appends(str_t *str, const char *s)
+{
+    int i;
+
+    for(i = 0; s[i]; i++) {
+        if(str_appendc(str, s[i]) < 0) {
+            break;
+        }
+    }
+
+    return(i);
+}
+
+/* Return the NUL-terminated contents; the buffer stays owned by str. */
+const char* str_value(str_t *str)
+{
+    return(str->data);
+}
+
+/*
+ * Replace the contents of str with a copy of val.
+ * Returns 0 on success or -ENOMEM, in which case the old value is kept.
+ */
+int str_set(str_t *str, const char *val)
+{
+    size_t nlen;
+    int ret_val;
+
+    nlen = strlen(val);
+
+    while(str->size < nlen) {
+        ret_val = str_expand(str);
+
+        if(ret_val < 0) {
+            return(ret_val);
+        }
+    }
+
+    /* nlen <= size and the buffer holds size + 1 bytes, so the NUL fits */
+    memmove(str->data, val, nlen + 1);
+    str->len = nlen;
+
+    return(0);
+}
diff --git a/src/str.h b/src/str.h
new file mode 100644
index 0000000..9d7b64b
--- /dev/null
+++ b/src/str.h
@@ -0,0 +1,14 @@
+#ifndef STR_H
+#define STR_H
+
+/* Opaque growable string; the layout is private to str.c. */
+typedef struct str str_t;
+
+str_t *str_new(void);
+int str_appendc(str_t *str, const char c);
+int str_appends(str_t *str, const char *s);
+void str_free(str_t *str);
+const char* str_value(str_t *str);
+int str_set(str_t *str, const char *val);
+
+#endif /* STR_H */
diff --git a/src/str_test.c b/src/str_test.c
new file mode 100644
index 0000000..72eb6e5
--- /dev/null
+++ b/src/str_test.c
@@ -0,0 +1,26 @@
+#include <stdio.h>
+#include "str.h"
+
+/* Smoke test: build "Hello, world!" from two appends and print it. */
+int main(int argc, char *argv[])
+{
+    str_t *str;
+
+    (void)argc;
+    (void)argv;
+
+    str = str_new();
+
+    if(!str) {
+        fprintf(stderr, "str_new: out of memory\n");
+        return(1);
+    }
+
+    str_appends(str, "Hello, world");
+    str_appendc(str, '!');
+
+    printf("%s\n", str_value(str));
+    str_free(str);
+
+    return(0);
+}
diff --git a/src/token.c b/src/token.c
new file mode 100644
index 0000000..cafc7ac
--- /dev/null
+++ b/src/token.c
@@ -0,0 +1,85 @@
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <errno.h>
+#include "token.h"
+#include "str.h"
+
+/* Allocate a zeroed token (no position, no value). NULL on failure. */
+struct token *token_new(void)
+{
+    return(calloc(1, sizeof(struct token)));
+}
+
+/*
+ * Allocate a token at line:col whose value is a copy of val.
+ * Returns NULL if the token or its value cannot be allocated.
+ */
+struct token *token_new2(const int line, const int col, const char *val)
+{
+    struct token *tok;
+
+    tok = token_new();
+
+    if(tok) {
+        token_setpos(tok, line, col);
+
+        /*
+         * Not wrapped in assert(): with NDEBUG the call would be
+         * compiled out and the token left without a value.
+         */
+        if(token_setvalue(tok, val) < 0) {
+            token_free(tok);
+            tok = NULL;
+        }
+    }
+
+    return(tok);
+}
+
+/* Release a token together with its value. Passing NULL is a no-op. */
+void token_free(struct token *tok)
+{
+    if(tok) {
+        str_free(tok->value);
+        free(tok);
+    }
+
+    return;
+}
+
+/* Record the source position of a token. */
+void token_setpos(struct token *tok, const int line, const int col)
+{
+    tok->line = line;
+    tok->column = col;
+    return;
+}
+
+/*
+ * Set the value of a token to a copy of str, allocating the value
+ * string on first use.
+ * Returns 0 on success or -ENOMEM.
+ */
+int token_setvalue(struct token *tok, const char *str)
+{
+    int ret_val;
+
+    if(!tok->value) {
+        tok->value = str_new();
+    }
+
+    if(tok->value) {
+        ret_val = str_set(tok->value, str);
+    } else {
+        ret_val = -ENOMEM;
+    }
+
+    return(ret_val);
+}
+
+/* Return the value of a token, or NULL if none has been set. */
+const char *token_getvalue(struct token *tok)
+{
+    return(tok->value ? str_value(tok->value) : NULL);
+}
diff --git a/src/token.h b/src/token.h
new file mode 100644
index 0000000..a6cc972
--- /dev/null
+++ b/src/token.h
@@ -0,0 +1,22 @@
+#ifndef TOKEN_H
+#define TOKEN_H
+
+#include <stddef.h>
+#include "str.h"
+
+/* A lexical token: its source position and its text. */
+struct token {
+    int line;
+    int column;
+    str_t *value;
+    size_t len;
+};
+
+struct token *token_new2(const int line, const int col, const char *val);
+struct token *token_new(void);
+void token_free(struct token *tok);
+void token_setpos(struct token *tok, const int line, const int col);
+int token_setvalue(struct token *tok, const char *str);
+const char *token_getvalue(struct token *tok);
+
+#endif /* TOKEN_H */
diff --git a/src/tokenize.c b/src/tokenize.c
new file mode 100644
index 0000000..73fe060
--- /dev/null
+++ b/src/tokenize.c
@@ -0,0 +1,419 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <assert.h>
+#include "str.h"
+#include "list.h"
+#include "token.h"
+
+#define TABWIDTH 8
+#define STATE_NONE 0
+#define STATE_COMMENT 1
+#define STATE_OP 2
+#define STATE_DIV 3
+
+#define STATE_DONE 8
+
+/*
+ * Reader state. _next holds one pushed-back character (0 = none).
+ * _line/_col are the position of the next character to be read and
+ * _pline/_pcol the position of the character read last, which is what
+ * putnext() rewinds to. _state is only used by the disabled driver.
+ */
+static char _next = 0;
+static int _state = STATE_NONE;
+static int _line = 1;
+static int _col = 1;
+static int _pline = 1;
+static int _pcol = 1;
+list_t *tokens = NULL;
+
+/*
+ * Read the next input character, preferring a character pushed back
+ * with putnext(), and advance the position counters.
+ * Returns 0 at end of input.
+ */
+int getnext(void)
+{
+    int ret_val;
+
+    if(_next) {
+        ret_val = _next;
+        _next = 0;
+    } else {
+        ret_val = getchar();
+
+        if(ret_val == EOF) {
+            ret_val = 0;
+        }
+    }
+
+    if(!ret_val) {
+        return(0);
+    }
+
+    /*
+     * Save the position of this character for putnext(). The line is
+     * saved for every character, not just for '\n': otherwise pushing
+     * back the first characters of a line rewinds to the line before.
+     */
+    _pline = _line;
+    _pcol = _col;
+
+    switch(ret_val) {
+    case '\n':
+        _line++;
+        _col = 1;
+        break;
+
+    case '\t':
+        _col += TABWIDTH;
+        break;
+
+    default:
+        _col++;
+        break;
+    }
+
+    return(ret_val);
+}
+
+/*
+ * Push c back so that the next getnext() returns it again, and rewind
+ * the position counters to where c was read. Only one character can
+ * be pending. Pushing back 0 (end of input) does nothing.
+ * Returns 0 on success or -EALREADY if a character is already pending.
+ */
+int putnext(const char c)
+{
+    if(!c) {
+        return(0);
+    }
+
+    if(_next) {
+        return(-EALREADY);
+    }
+
+    _next = c;
+    _col = _pcol;
+    _line = _pline;
+
+    return(0);
+}
+
+/* Earlier per-state driver; disabled and incomplete, kept for reference. */
+#if 0
+int comment(void)
+{
+    struct token *tok;
+    int cstate;
+    char c;
+
+#define CSTATE_HEAD 0
+#define CSTATE_BODY 1
+#define CSTATE_FOOT 2
+#define CSTATE_DONE 3
+
+    cstate = CSTATE_HEAD;
+
+    tok = token_new(TOKEN_COMMENT);
+    assert(tok);
+
+    token_setpos(tok, _line, _col);
+
+    str_appendc(tok->value, '/');
+
+    while(cstate != CSTATE_DONE) {
+        c = getnext();
+
+        str_appendc(tok->value, c);
+
+        switch(cstate) {
+        case CSTATE_HEAD:
+            assert(c == '*');
+            cstate = CSTATE_BODY;
+            break;
+
+        case CSTATE_BODY:
+            if(c == '*') {
+                cstate = CSTATE_FOOT;
+            }
+            break;
+
+        case CSTATE_FOOT:
+            if(c == '/') {
+                cstate = CSTATE_DONE;
+            } else {
+                cstate = CSTATE_BODY;
+            }
+
+            break;
+
+        default:
+            assert(0);
+            break;
+        }
+    }
+
+    list_append(&tokens, tok);
+
+    return(0);
+}
+
+int div(void)
+{
+    char c;
+
+    c = getnext();
+
+int none(void)
+{
+    char c;
+    int ret_val;
+    char lookahead;
+
+    c = getnext();
+
+    if(!c) {
+        return(STATE_DONE);
+    }
+
+    switch(c) {
+    case ' ':
+    case '\t':
+    case '\n':
+        ret_val = STATE_NONE;
+        break;
+
+    case '/':
+        lookahead = getnext();
+
+        if(lookahead == '*') {
+            ret_val = STATE_COMMENT;
+        } else {
+            ret_val = STATE_DIV;
+        }
+
+        putnext(lookahead);
+
+        break;
+
+    default:
+        ret_val = STATE_NONE;
+        break;
+/*
+    case '"':
+    case '\'':
+    case '0':
+    case '1':
+    case '2':
+    case '3':
+    case '4':
+    case '5':
+    case '6':
+    case '7':
+    case '8':
+    case '9':
+        ret_val = STATE_LITERAL;
+        break;
+ */
+/*
+    case '+':
+    case '-':
+    case '*':
+    case '=':
+        ret_val = STATE_OP;
+        break;
+*/
+/*
+    case '<':
+    case '>':
+    case '?':
+    case ',':
+    case ';':
+    case ':':
+    case '!':
+    case '%':
+    case '&':
+    case '(':
+    case ')':
+    case '~':
+    case '^':
+    case '|':
+    case '[':
+    case ']':
+    case '{':
+    case '}':
+*/
+    }
+
+    return(ret_val);
+}
+#endif /* 0 */
+
+/*
+ * Read the next token from standard input.
+ *
+ * Recognised so far: "/", "/=" and block comments. Blanks, tabs and
+ * line ends between tokens are skipped. A token carries the position
+ * of its first character.
+ *
+ * Returns the token, to be released with token_free(), or NULL at end
+ * of input, on an unrecognised character or when out of memory.
+ */
+struct token *nexttoken(void)
+{
+    struct token *tok;
+    int state;
+    int line;
+    int col;
+    char prev;
+
+    state = STATE_NONE;
+    tok = NULL;
+    line = _line;
+    col = _col;
+    prev = 0;
+
+    while(1) {
+        char c;
+
+        if(state == STATE_NONE) {
+            /* nothing consumed yet: the token starts at the next char */
+            line = _line;
+            col = _col;
+        }
+
+        c = getnext();
+
+        switch(state) {
+        case STATE_NONE:
+            switch(c) {
+            case 0:
+                /* end of input between tokens is not an error */
+                return(NULL);
+
+            case '/':
+                state = STATE_DIV;
+                break;
+
+            case '\r':
+                fprintf(stderr, "DOS user, eh?\n");
+                /* fallthrough */
+            case ' ':
+            case '\t':
+            case '\n':
+                break;
+
+            default:
+                /* unrecognized token */
+                fprintf(stderr, "Unrecognized token at %d:%d\n", line, col);
+                return(NULL);
+            }
+
+            break;
+
+        case STATE_DIV:
+            switch(c) {
+            case '*':
+                state = STATE_COMMENT;
+                break;
+
+            case '=':
+                return(token_new2(line, col, "/="));
+
+            default:
+                putnext(c);
+                return(token_new2(line, col, "/"));
+            }
+
+            break;
+
+        case STATE_COMMENT:
+            if(!tok) {
+                tok = token_new2(line, col, "/*");
+
+                if(!tok) {
+                    return(NULL);
+                }
+            }
+
+            if(!c) {
+                /* input ended inside the comment: return what was read */
+                fprintf(stderr, "Unterminated comment at %d:%d\n", line, col);
+                return(tok);
+            }
+
+            if(str_appendc(tok->value, c) < 0) {
+                token_free(tok);
+                return(NULL);
+            }
+
+            /* prev starts at 0, so the '*' of the opener cannot close it */
+            if(prev == '*' && c == '/') {
+                return(tok);
+            }
+
+            prev = c;
+            break;
+
+        default:
+            assert(0);
+            return(NULL);
+        }
+    }
+}
+
+/* Print every token found on standard input. */
+int main(int argc, char *argv[])
+{
+    int ret_val;
+    struct token *tok;
+
+    (void)argc;
+    (void)argv;
+
+    ret_val = 0;
+
+    while((tok = nexttoken())) {
+        printf("Token at %d:%d: %s\n", tok->line, tok->column, str_value(tok->value));
+        token_free(tok);
+    }
+#if 0
+    while(_state != STATE_DONE) {
+        printf("_state = %d\n", _state);
+
+        switch(_state) {
+        case STATE_NONE:
+            _state = none();
+            break;
+
+        case STATE_COMMENT:
+            _state = comment();
+            break;
+
+        case STATE_OP:
+            _state = op();
+            break;
+
+        default:
+            _state = STATE_DONE;
+            break;
+        }
+    }
+
+    if(!tokens) {
+        printf("No tokens\n");
+    }
+
+    for(litem = tokens; litem; litem = litem->next) {
+        struct token *t;
+
+        t = (struct token*)litem->data;
+
+        printf("%s token at line %d:%d\n", token_typestr(t->type), t->line, t->column);
+    }
+#endif
+
+    return(ret_val);
+}