From: Matthias Kruk Date: Sun, 31 May 2020 03:49:36 +0000 (+0900) Subject: Rename tokenize.c to lex.c X-Git-Url: https://git.corax.cc/?a=commitdiff_plain;h=87d69e4b5c6378999aa682a4148f6f13719a0e82;p=ccc Rename tokenize.c to lex.c --- diff --git a/src/Makefile b/src/Makefile index 51280eb..9560990 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,5 +1,5 @@ -OBJECTS = str.o token.o list.o tokenize.o -OUTPUT = tokenize +OBJECTS = str.o token.o list.o lex.o +OUTPUT = c3 PHONY = clean all: $(OUTPUT) diff --git a/src/lex.c b/src/lex.c new file mode 100644 index 0000000..c70f5e3 --- /dev/null +++ b/src/lex.c @@ -0,0 +1,522 @@ +#include +#include +#include +#include +#include "str.h" +#include "list.h" +#include "token.h" + +#define TABWIDTH 8 + +#define STATE_NONE 0 +#define STATE_COMMENT 1 +#define STATE_COMMENT_END 2 +#define STATE_DIV 3 +#define STATE_LT 4 +#define STATE_SHL 5 +#define STATE_GT 6 +#define STATE_SHR 7 +#define STATE_NOT 8 +#define STATE_MOD 9 +#define STATE_AND 10 +#define STATE_OR 11 +#define STATE_XOR 12 +#define STATE_ASSIGN 13 +#define STATE_STRING 14 +#define STATE_STRING_ESC 15 +#define STATE_CHR 16 +#define STATE_CHR_ESC 17 +#define STATE_ID 18 +#define STATE_ADD 19 +#define STATE_SUB 20 +#define STATE_MUL 21 +#define STATE_DOT 22 +#define STATE_ZERO 23 +#define STATE_NUM 24 + +#define STATE_DONE 8 + +#define identifier_firstchr(_c) ((_c) == '_' || \ + (_c) >= 'a' && (_c) <= 'z' || \ + (_c) >= 'A' && (_c) <= 'Z') +#define identifier_chr(_c) (identifier_firstchr(_c) || \ + (_c) >= '0' && (_c) <= '9') + +static char _next = 0; +static int _state = STATE_NONE; +static int _line = 1; +static int _col = 0; +static int _pline = 1; +static int _pcol = 1; +list_t *tokens = NULL; + +int getnext(void) +{ + int ret_val; + + if(_next) { + ret_val = _next; + _next = 0; + } else { + ret_val = getchar(); + + if(ret_val == EOF) { + ret_val = 0; + } + } + + switch(ret_val) { + case '\n': + _pline = _line; + _pcol = _col; + _line++; + _col = 0; + break; + + case '\t': + _pcol = _col; + _col += TABWIDTH; + break; + + default: + _pcol = _col; + _col++; + break; + + case 0: + break; + } + + return(ret_val); +} + +int putnext(const char c) +{ + int ret_val; + + if(_next) { + ret_val = -EALREADY; + } else { + _next = c; + ret_val = 0; + + _col = _pcol; + _line = _pline; + } + + return(ret_val); +} + +struct token *nexttoken(void) +{ + struct token *tok; + int state; + char c; + + state = STATE_NONE; + tok = NULL; + + while((c = getnext())) { + switch(state) { + case STATE_NONE: + switch(c) { + case '/': + state = STATE_DIV; + break; + + case '<': + state = STATE_LT; + break; + + case '>': + state = STATE_GT; + break; + + case '!': + state = STATE_NOT; + break; + + case '%': + state = STATE_MOD; + break; + + case '&': + state = STATE_AND; + break; + + case '|': + state = STATE_OR; + break; + + case '^': + state = STATE_XOR; + break; + + case '=': + state = STATE_ASSIGN; + break; + + case '"': + state = STATE_STRING; + break; + + case '\'': + state = STATE_CHR; + break; + + case '*': + state = STATE_MUL; + break; + + case '+': + state = STATE_ADD; + break; + + case '-': + state = STATE_SUB; + break; + + case '.': + state = STATE_DOT; + break; + + case '~': + case '(': + case ')': + case '{': + case '}': + case '[': + case ']': + case ':': + case ';': + case ',': + case '?': + return(token_new_from_char(_line, _col, c)); + + case '\r': + fprintf(stderr, "DOS user, eh?\n"); + case ' ': + case '\t': + case '\n': + break; + + default: + if(identifier_firstchr(c)) { + /* looks like an identifier */ + tok = token_new_from_char(_line, _col, c); + assert(tok); + state = STATE_ID; + break; + } else if(c == '0') { + tok = token_new_from_char(_line, _col, c); + assert(tok); + state = STATE_ZERO; + break; + } else if(c > '0' && c <= '9') { + tok = token_new_from_char(_line, _col, c); + assert(tok); + state = STATE_NUM; + break; + } + + /* unrecognized token */ + fprintf(stderr, "Unrecognized token at %d:%d ['%c']\n", _line, _col, c); + return(NULL); + } + + break; + + case STATE_DIV: + switch(c) { + case '*': + state = STATE_COMMENT; + break; + + case '=': + return(token_new2(_line, _col, "/=")); + + default: + putnext(c); + return(token_new2(_line, _col, "/")); + } + + break; + + case STATE_COMMENT: + if(!tok) { + tok = token_new2(_line, _col, "/*"); + assert(tok); + } + + str_appendc(tok->value, c); + + if(c == '*') { + state = STATE_COMMENT_END; + } + + break; + + case STATE_COMMENT_END: + str_appendc(tok->value, c); + + if(c == '/') { + return(tok); + } + + state = STATE_COMMENT; + break; + + case STATE_LT: + switch(c) { + case '<': + state = STATE_SHL; + break; + + case '=': + return(token_new2(_line, _col, "<=")); + break; + + default: + putnext(c); + return(token_new2(_line, _col, "<")); + } + + break; + + case STATE_SHL: + if(c == '=') { + return(token_new2(_line, _col, "<<=")); + } + + putnext(c); + return(token_new2(_line, _col, "<<")); + + case STATE_GT: + switch(c) { + case '>': + state = STATE_SHR; + break; + + case '=': + return(token_new2(_line, _col, ">=")); + + default: + putnext(c); + return(token_new2(_line, _col, ">")); + } + + break; + + case STATE_SHR: + if(c == '=') { + return(token_new2(_line, _col, ">>=")); + } + + putnext(c); + return(token_new2(_line, _col, ">>")); + + case STATE_NOT: + if(c == '=') { + return(token_new2(_line, _col, "!=")); + } + + putnext(c); + return(token_new2(_line, _col, "!")); + + case STATE_MOD: + if(c == '=') { + return(token_new2(_line, _col, "%=")); + } + + putnext(c); + return(token_new2(_line, _col, "%")); + + case STATE_AND: + switch(c) { + case '&': + return(token_new2(_line, _col, "&&")); + + case '=': + return(token_new2(_line, _col, "&=")); + + default: + putnext(c); + return(token_new2(_line, _col, "&")); + } + + case STATE_OR: + switch(c) { + case '|': + return(token_new2(_line, _col, "||")); + + case '=': + return(token_new2(_line, _col, "|=")); + + default: + putnext(c); + return(token_new2(_line, _col, "|")); + } + + case STATE_XOR: + if(c == '=') { + return(token_new2(_line, _col, "^=")); + } + + putnext(c); + return(token_new2(_line, _col, "^")); + + case STATE_ASSIGN: + if(c == '=') { + return(token_new2(_line, _col, "==")); + } + + putnext(c); + return(token_new2(_line, _col, "=")); + + case STATE_STRING: + if(!tok) { + tok = token_new2(_line, _col, "\""); + assert(tok); + } + + str_appendc(tok->value, c); + + if(c == '\\') { + state = STATE_STRING_ESC; + } else if(c == '"') { + return(tok); + } + + break; + + case STATE_STRING_ESC: + str_appendc(tok->value, c); + state = STATE_STRING; + break; + + case STATE_CHR: + if(!tok) { + tok = token_new2(_line, _col, "'"); + assert(tok); + } + + str_appendc(tok->value, c); + + if(c == '\\') { + state = STATE_CHR_ESC; + } else if(c == '\'') { + return(tok); + } + + break; + + case STATE_CHR_ESC: + str_appendc(tok->value, c); + state = STATE_CHR; + break; + + case STATE_ID: + if(identifier_chr(c)) { + str_appendc(tok->value, c); + } else { + putnext(c); + return(tok); + } + + break; + + case STATE_MUL: + switch(c) { + case '=': + return(token_new2(_line, _col, "*=")); + + default: + putnext(c); + return(token_new2(_line, _col, "*")); + } + + case STATE_ADD: + switch(c) { + case '+': + return(token_new2(_line, _col, "++")); + + case '=': + return(token_new2(_line, _col, "+=")); + + default: + putnext(c); + return(token_new2(_line, _col, "+")); + } + + case STATE_SUB: + switch(c) { + case '-': + return(token_new2(_line, _col, "--")); + + case '=': + return(token_new2(_line, _col, "-=")); + + case '>': + return(token_new2(_line, _col, "->")); + + default: + putnext(c); + return(token_new2(_line, _col, "-")); + } + + case STATE_DOT: + putnext(c); + + if(c >= '0' && c <= '9') { + state = STATE_NUM; + } else { + return(token_new2(_line, _col, ".")); + } + + break; + + case STATE_ZERO: + if(c == 'x' || c == 'X') { + str_appendc(tok->value, 'x'); + state = STATE_NUM; + } else if(c >= '0' && c <= '9') { + putnext(c); + state = STATE_NUM; + } else if(c == '.') { + + } else { + putnext(c); + return(tok); + } + + break; + + case STATE_NUM: + /* FIXME: e, E may be in the middle, (u|U)(l|L|ll|LL) or vice versa may be at the end */ + if(c >= '0' && c <= '9' || c == '.') { + str_appendc(tok->value, c); + } else { + putnext(c); + return(tok); + } + + break; + } + } + + return(tok); +} + +int main(int argc, char *argv[]) +{ + int ret_val; + struct token *tok; + + ret_val = 0; + + while((tok = nexttoken())) { + printf("Token at %4d:%3d: \"%s\"\n", tok->line, tok->column, str_value(tok->value)); + } + + return(ret_val); +} diff --git a/src/tokenize.c b/src/tokenize.c deleted file mode 100644 index c70f5e3..0000000 --- a/src/tokenize.c +++ /dev/null @@ -1,522 +0,0 @@ -#include -#include -#include -#include -#include "str.h" -#include "list.h" -#include "token.h" - -#define TABWIDTH 8 - -#define STATE_NONE 0 -#define STATE_COMMENT 1 -#define STATE_COMMENT_END 2 -#define STATE_DIV 3 -#define STATE_LT 4 -#define STATE_SHL 5 -#define STATE_GT 6 -#define STATE_SHR 7 -#define STATE_NOT 8 -#define STATE_MOD 9 -#define STATE_AND 10 -#define STATE_OR 11 -#define STATE_XOR 12 -#define STATE_ASSIGN 13 -#define STATE_STRING 14 -#define STATE_STRING_ESC 15 -#define STATE_CHR 16 -#define STATE_CHR_ESC 17 -#define STATE_ID 18 -#define STATE_ADD 19 -#define STATE_SUB 20 -#define STATE_MUL 21 -#define STATE_DOT 22 -#define STATE_ZERO 23 -#define STATE_NUM 24 - -#define STATE_DONE 8 - -#define identifier_firstchr(_c) ((_c) == '_' || \ - (_c) >= 'a' && (_c) <= 'z' || \ - (_c) >= 'A' && (_c) <= 'Z') -#define identifier_chr(_c) (identifier_firstchr(_c) || \ - (_c) >= '0' && (_c) <= '9') - -static char _next = 0; -static int _state = STATE_NONE; -static int _line = 1; -static int _col = 0; -static int _pline = 1; -static int _pcol = 1; -list_t *tokens = NULL; - -int getnext(void) -{ - int ret_val; - - if(_next) { - ret_val = _next; - _next = 0; - } else { - ret_val = getchar(); - - if(ret_val == EOF) { - ret_val = 0; - } - } - - switch(ret_val) { - case '\n': - _pline = _line; - _pcol = _col; - _line++; - _col = 0; - break; - - case '\t': - _pcol = _col; - _col += TABWIDTH; - break; - - default: - _pcol = _col; - _col++; - break; - - case 0: - break; - } - - return(ret_val); -} - -int putnext(const char c) -{ - int ret_val; - - if(_next) { - ret_val = -EALREADY; - } else { - _next = c; - ret_val = 0; - - _col = _pcol; - _line = _pline; - } - - return(ret_val); -} - -struct token *nexttoken(void) -{ - struct token *tok; - int state; - char c; - - state = STATE_NONE; - tok = NULL; - - while((c = getnext())) { - switch(state) { - case STATE_NONE: - switch(c) { - case '/': - state = STATE_DIV; - break; - - case '<': - state = STATE_LT; - break; - - case '>': - state = STATE_GT; - break; - - case '!': - state = STATE_NOT; - break; - - case '%': - state = STATE_MOD; - break; - - case '&': - state = STATE_AND; - break; - - case '|': - state = STATE_OR; - break; - - case '^': - state = STATE_XOR; - break; - - case '=': - state = STATE_ASSIGN; - break; - - case '"': - state = STATE_STRING; - break; - - case '\'': - state = STATE_CHR; - break; - - case '*': - state = STATE_MUL; - break; - - case '+': - state = STATE_ADD; - break; - - case '-': - state = STATE_SUB; - break; - - case '.': - state = STATE_DOT; - break; - - case '~': - case '(': - case ')': - case '{': - case '}': - case '[': - case ']': - case ':': - case ';': - case ',': - case '?': - return(token_new_from_char(_line, _col, c)); - - case '\r': - fprintf(stderr, "DOS user, eh?\n"); - case ' ': - case '\t': - case '\n': - break; - - default: - if(identifier_firstchr(c)) { - /* looks like an identifier */ - tok = token_new_from_char(_line, _col, c); - assert(tok); - state = STATE_ID; - break; - } else if(c == '0') { - tok = token_new_from_char(_line, _col, c); - assert(tok); - state = STATE_ZERO; - break; - } else if(c > '0' && c <= '9') { - tok = token_new_from_char(_line, _col, c); - assert(tok); - state = STATE_NUM; - break; - } - - /* unrecognized token */ - fprintf(stderr, "Unrecognized token at %d:%d ['%c']\n", _line, _col, c); - return(NULL); - } - - break; - - case STATE_DIV: - switch(c) { - case '*': - state = STATE_COMMENT; - break; - - case '=': - return(token_new2(_line, _col, "/=")); - - default: - putnext(c); - return(token_new2(_line, _col, "/")); - } - - break; - - case STATE_COMMENT: - if(!tok) { - tok = token_new2(_line, _col, "/*"); - assert(tok); - } - - str_appendc(tok->value, c); - - if(c == '*') { - state = STATE_COMMENT_END; - } - - break; - - case STATE_COMMENT_END: - str_appendc(tok->value, c); - - if(c == '/') { - return(tok); - } - - state = STATE_COMMENT; - break; - - case STATE_LT: - switch(c) { - case '<': - state = STATE_SHL; - break; - - case '=': - return(token_new2(_line, _col, "<=")); - break; - - default: - putnext(c); - return(token_new2(_line, _col, "<")); - } - - break; - - case STATE_SHL: - if(c == '=') { - return(token_new2(_line, _col, "<<=")); - } - - putnext(c); - return(token_new2(_line, _col, "<<")); - - case STATE_GT: - switch(c) { - case '>': - state = STATE_SHR; - break; - - case '=': - return(token_new2(_line, _col, ">=")); - - default: - putnext(c); - return(token_new2(_line, _col, ">")); - } - - break; - - case STATE_SHR: - if(c == '=') { - return(token_new2(_line, _col, ">>=")); - } - - putnext(c); - return(token_new2(_line, _col, ">>")); - - case STATE_NOT: - if(c == '=') { - return(token_new2(_line, _col, "!=")); - } - - putnext(c); - return(token_new2(_line, _col, "!")); - - case STATE_MOD: - if(c == '=') { - return(token_new2(_line, _col, "%=")); - } - - putnext(c); - return(token_new2(_line, _col, "%")); - - case STATE_AND: - switch(c) { - case '&': - return(token_new2(_line, _col, "&&")); - - case '=': - return(token_new2(_line, _col, "&=")); - - default: - putnext(c); - return(token_new2(_line, _col, "&")); - } - - case STATE_OR: - switch(c) { - case '|': - return(token_new2(_line, _col, "||")); - - case '=': - return(token_new2(_line, _col, "|=")); - - default: - putnext(c); - return(token_new2(_line, _col, "|")); - } - - case STATE_XOR: - if(c == '=') { - return(token_new2(_line, _col, "^=")); - } - - putnext(c); - return(token_new2(_line, _col, "^")); - - case STATE_ASSIGN: - if(c == '=') { - return(token_new2(_line, _col, "==")); - } - - putnext(c); - return(token_new2(_line, _col, "=")); - - case STATE_STRING: - if(!tok) { - tok = token_new2(_line, _col, "\""); - assert(tok); - } - - str_appendc(tok->value, c); - - if(c == '\\') { - state = STATE_STRING_ESC; - } else if(c == '"') { - return(tok); - } - - break; - - case STATE_STRING_ESC: - str_appendc(tok->value, c); - state = STATE_STRING; - break; - - case STATE_CHR: - if(!tok) { - tok = token_new2(_line, _col, "'"); - assert(tok); - } - - str_appendc(tok->value, c); - - if(c == '\\') { - state = STATE_CHR_ESC; - } else if(c == '\'') { - return(tok); - } - - break; - - case STATE_CHR_ESC: - str_appendc(tok->value, c); - state = STATE_CHR; - break; - - case STATE_ID: - if(identifier_chr(c)) { - str_appendc(tok->value, c); - } else { - putnext(c); - return(tok); - } - - break; - - case STATE_MUL: - switch(c) { - case '=': - return(token_new2(_line, _col, "*=")); - - default: - putnext(c); - return(token_new2(_line, _col, "*")); - } - - case STATE_ADD: - switch(c) { - case '+': - return(token_new2(_line, _col, "++")); - - case '=': - return(token_new2(_line, _col, "+=")); - - default: - putnext(c); - return(token_new2(_line, _col, "+")); - } - - case STATE_SUB: - switch(c) { - case '-': - return(token_new2(_line, _col, "--")); - - case '=': - return(token_new2(_line, _col, "-=")); - - case '>': - return(token_new2(_line, _col, "->")); - - default: - putnext(c); - return(token_new2(_line, _col, "-")); - } - - case STATE_DOT: - putnext(c); - - if(c >= '0' && c <= '9') { - state = STATE_NUM; - } else { - return(token_new2(_line, _col, ".")); - } - - break; - - case STATE_ZERO: - if(c == 'x' || c == 'X') { - str_appendc(tok->value, 'x'); - state = STATE_NUM; - } else if(c >= '0' && c <= '9') { - putnext(c); - state = STATE_NUM; - } else if(c == '.') { - - } else { - putnext(c); - return(tok); - } - - break; - - case STATE_NUM: - /* FIXME: e, E may be in the middle, (u|U)(l|L|ll|LL) or vice versa may be at the end */ - if(c >= '0' && c <= '9' || c == '.') { - str_appendc(tok->value, c); - } else { - putnext(c); - return(tok); - } - - break; - } - } - - return(tok); -} - -int main(int argc, char *argv[]) -{ - int ret_val; - struct token *tok; - - ret_val = 0; - - while((tok = nexttoken())) { - printf("Token at %4d:%3d: \"%s\"\n", tok->line, tok->column, str_value(tok->value)); - } - - return(ret_val); -}