From 50f1d28328d6444d96995d12d6e164a5c242bd1e Mon Sep 17 00:00:00 2001
From: Matthias Kruk
Date: Sat, 30 May 2020 18:24:34 +0900
Subject: [PATCH] tokenize: Implement recognition of identifiers, numeric
 literals, and several operators

---
Review notes (ignored by git-am, not part of the commit message):

 - STATE_SUB: the default branch called putchar(c), which prints the
   lookahead character to stdout and drops it from the input. It now
   calls putnext(c), as the STATE_ADD and STATE_MUL branches do.
 - STATE_DOT: a token holding "." is now created before entering
   STATE_NUM, because STATE_NUM appends to tok->value and nothing else
   on this path assigns tok.
 - STATE_ZERO: the '.' branch was empty, so "0.5" lost its decimal
   point. The point is now appended and the state moves to STATE_NUM.
 - The identifier macros and the STATE_NUM condition gained explicit
   parentheses around the && groups; behaviour is unchanged.
 - Hunk sizes, hunk offsets and the diffstat were adjusted for the added
   lines. The post-image blob id on the "index" line is still the one
   from the original patch.
 - Indentation and the whitespace-only removed lines were reconstructed;
   if a hunk does not match, apply with "git apply --ignore-whitespace".

 src/tokenize.c | 165 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 162 insertions(+), 3 deletions(-)

diff --git a/src/tokenize.c b/src/tokenize.c
index 88371e2..9790f6d 100644
--- a/src/tokenize.c
+++ b/src/tokenize.c
@@ -26,9 +26,24 @@
 #define STATE_STRING_ESC 15
 #define STATE_CHR 16
 #define STATE_CHR_ESC 17
+#define STATE_ID 18
+#define STATE_ADD 19
+#define STATE_SUB 20
+#define STATE_MUL 21
+#define STATE_DOT 22
+#define STATE_ZERO 23
+#define STATE_NUM 24
 
 #define STATE_DONE 8
 
+/* true if _c may start an identifier; _c is evaluated more than once */
+#define identifier_firstchr(_c) ((_c) == '_' || \
+				 ((_c) >= 'a' && (_c) <= 'z') || \
+				 ((_c) >= 'A' && (_c) <= 'Z'))
+/* true if _c may continue an identifier; _c is evaluated more than once */
+#define identifier_chr(_c) (identifier_firstchr(_c) || \
+			    ((_c) >= '0' && (_c) <= '9'))
+
 static char _next = 0;
 static int _state = STATE_NONE;
 static int _line = 1;
@@ -146,7 +161,40 @@ struct token *nexttoken(void)
 			case '"':
 				state = STATE_STRING;
 				break;
-				
+
+			case '\'':
+				state = STATE_CHR;
+				break;
+
+			case '*':
+				state = STATE_MUL;
+				break;
+
+			case '+':
+				state = STATE_ADD;
+				break;
+
+			case '-':
+				state = STATE_SUB;
+				break;
+
+			case '.':
+				state = STATE_DOT;
+				break;
+
+			case '~':
+			case '(':
+			case ')':
+			case '{':
+			case '}':
+			case '[':
+			case ']':
+			case ':':
+			case ';':
+			case ',':
+			case '?':
+				return(token_new_from_char(_line, _col, c));
+
 			case '\r':
 				fprintf(stderr, "DOS user, eh?\n");
 			case ' ':
@@ -155,8 +203,26 @@ struct token *nexttoken(void)
 				break;
 
 			default:
+				if(identifier_firstchr(c)) {
+					/* looks like an identifier */
+					tok = token_new_from_char(_line, _col, c);
+					assert(tok);
+					state = STATE_ID;
+					break;
+				} else if(c == '0') {
+					tok = token_new_from_char(_line, _col, c);
+					assert(tok);
+					state = STATE_ZERO;
+					break;
+				} else if(c > '0' && c <= '9') {
+					tok = token_new_from_char(_line, _col, c);
+					assert(tok);
+					state = STATE_NUM;
+					break;
+				}
+
 				/* unrecognized token */
-				fprintf(stderr, "Unrecognized token at %d:%d [%02x]\n", _line, _col, c);
+				fprintf(stderr, "Unrecognized token at %d:%d ['%c']\n", _line, _col, c);
 				return(NULL);
 			}
 
@@ -316,7 +382,7 @@ struct token *nexttoken(void)
 			}
 
 			str_appendc(tok->value, c);
-			
+
 			if(c == '\\') {
 				state = STATE_STRING_ESC;
 			} else if(c == '"') {
@@ -350,6 +416,99 @@ struct token *nexttoken(void)
 			str_appendc(tok->value, c);
 			state = STATE_CHR;
 			break;
+
+		case STATE_ID:
+			/* collect identifier characters; anything else ends the token */
+			if(identifier_chr(c)) {
+				str_appendc(tok->value, c);
+			} else {
+				putnext(c);
+				return(tok);
+			}
+
+			break;
+
+		case STATE_MUL:
+			switch(c) {
+			case '=':
+				return(token_new2(_line, _col, "*="));
+
+			default:
+				putnext(c);
+				return(token_new2(_line, _col, "*"));
+			}
+
+		case STATE_ADD:
+			switch(c) {
+			case '+':
+				return(token_new2(_line, _col, "++"));
+
+			case '=':
+				return(token_new2(_line, _col, "+="));
+
+			default:
+				putnext(c);
+				return(token_new2(_line, _col, "+"));
+			}
+
+		case STATE_SUB:
+			switch(c) {
+			case '-':
+				return(token_new2(_line, _col, "--"));
+
+			case '=':
+				return(token_new2(_line, _col, "-="));
+
+			default:
+				/* hand the lookahead back, as STATE_ADD and STATE_MUL do */
+				putnext(c);
+				return(token_new2(_line, _col, "-"));
+			}
+
+		case STATE_DOT:
+			putnext(c);
+
+			if(c >= '0' && c <= '9') {
+				/* ".5"-style literal: STATE_NUM appends to tok, so create it here */
+				tok = token_new_from_char(_line, _col, '.');
+				assert(tok);
+				state = STATE_NUM;
+			} else {
+				return(token_new2(_line, _col, "."));
+			}
+
+			break;
+
+		case STATE_ZERO:
+			if(c == 'x' || c == 'X') {
+				str_appendc(tok->value, 'x');
+				state = STATE_NUM;
+			} else if(c >= '0' && c <= '9') {
+				putnext(c);
+				state = STATE_NUM;
+			} else if(c == '.') {
+				/* keep the decimal point instead of silently dropping it */
+				str_appendc(tok->value, c);
+				state = STATE_NUM;
+			} else {
+				putnext(c);
+				return(tok);
+			}
+
+			break;
+
+		case STATE_NUM:
+			/* FIXME: e, E may be in the middle, (u|U)(l|L|ll|LL) or vice versa may be at the end */
+			/* FIXME: hexadecimal digits after "0x" are not accepted yet */
+
+			if((c >= '0' && c <= '9') || c == '.') {
+				str_appendc(tok->value, c);
+			} else {
+				putnext(c);
+				return(tok);
+			}
+
+			break;
 		}
 	}
 
-- 
2.47.3