tokenize: Implement recognition of identifiers, numeric literals, and several operators

author Matthias Kruk <m@m10k.eu>

Sat, 30 May 2020 09:24:34 +0000 (18:24 +0900)

committer Matthias Kruk <m@m10k.eu>

Sat, 30 May 2020 09:24:34 +0000 (18:24 +0900)
author Matthias Kruk <m@m10k.eu>
Sat, 30 May 2020 09:24:34 +0000 (18:24 +0900)
committer Matthias Kruk <m@m10k.eu>
Sat, 30 May 2020 09:24:34 +0000 (18:24 +0900)
diff --git a/src/tokenize.c b/src/tokenize.c

index 88371e2867a908fa066f7a97140834d64d0a209c..9790f6d9662390418e750a86ab2cec0dd2340a18 100644 (file)
--- a/src/tokenize.c
+++ b/src/tokenize.c
@@ -26,9 +26,22 @@
  #define STATE_STRING_ESC  15
  #define STATE_CHR         16
  #define STATE_CHR_ESC     17
+#define STATE_ID          18
+#define STATE_ADD         19
+#define STATE_SUB         20
+#define STATE_MUL         21
+#define STATE_DOT         22
+#define STATE_ZERO        23
+#define STATE_NUM         24
  
  #define STATE_DONE    8
  
+/* Nonzero if _c can start an identifier: '_' or an ASCII letter. */
+#define identifier_firstchr(_c) ((_c) == '_' || ((_c) >= 'a' && (_c) <= 'z') || \
+                                ((_c) >= 'A' && (_c) <= 'Z'))
+/* Nonzero if _c can continue an identifier (adds digits); _c is evaluated repeatedly. */
+#define identifier_chr(_c)      (identifier_firstchr(_c) || ((_c) >= '0' && (_c) <= '9'))
+
  static char _next = 0;
  static int _state = STATE_NONE;
  static int _line = 1;
@@ -146,7 +159,40 @@ struct token *nexttoken(void)
                         case '"':
                                 state = STATE_STRING;
                                 break;
-                               
+
+                       case '\'':
+                               state = STATE_CHR;
+                               break;
+
+                       case '*':
+                               state = STATE_MUL;
+                               break;
+
+                       case '+':
+                               state = STATE_ADD;
+                               break;
+
+                       case '-':
+                               state = STATE_SUB;
+                               break;
+
+                       case '.':
+                               state = STATE_DOT;
+                               break;
+
+                       case '~':
+                       case '(':
+                       case ')':
+                       case '{':
+                       case '}':
+                       case '[':
+                       case ']':
+                       case ':':
+                       case ';':
+                       case ',':
+                       case '?':
+                               return(token_new_from_char(_line, _col, c));
+
                         case '\r':
                                 fprintf(stderr, "DOS user, eh?\n");
                         case ' ':
@@ -155,8 +201,26 @@ struct token *nexttoken(void)
                                 break;
  
                         default:
+                               if(identifier_firstchr(c)) {
+                                       /* looks like an identifier */
+                                       tok = token_new_from_char(_line, _col, c);
+                                       assert(tok);
+                                       state = STATE_ID;
+                                       break;
+                               } else if(c == '0') {
+                                       tok = token_new_from_char(_line, _col, c);
+                                       assert(tok);
+                                       state = STATE_ZERO;
+                                       break;
+                               } else if(c > '0' && c <= '9') {
+                                       tok = token_new_from_char(_line, _col, c);
+                                       assert(tok);
+                                       state = STATE_NUM;
+                                       break;
+                               }
+
                                 /* unrecognized token */
-                               fprintf(stderr, "Unrecognized token at %d:%d [%02x]\n", _line, _col, c);
+                               fprintf(stderr, "Unrecognized token at %d:%d ['%c']\n", _line, _col, c);
                                 return(NULL);
                         }
  
@@ -316,7 +380,7 @@ struct token *nexttoken(void)
                         }
  
                         str_appendc(tok->value, c);
-                       
+
                         if(c == '\\') {
                                 state = STATE_STRING_ESC;
                         } else if(c == '"') {
@@ -350,6 +414,91 @@ struct token *nexttoken(void)
                         str_appendc(tok->value, c);
                         state = STATE_CHR;
                         break;
+
+               case STATE_ID:
+                       if(identifier_chr(c)) {
+                               str_appendc(tok->value, c);
+                       } else {
+                               putnext(c);
+                               return(tok);
+                       }
+
+                       break;
+
+               case STATE_MUL:
+                       switch(c) {
+                       case '=':
+                               return(token_new2(_line, _col, "*="));
+
+                       default:
+                               putnext(c);
+                               return(token_new2(_line, _col, "*"));
+                       }
+
+               case STATE_ADD:
+                       switch(c) {
+                       case '+':
+                               return(token_new2(_line, _col, "++"));
+
+                       case '=':
+                               return(token_new2(_line, _col, "+="));
+
+                       default:
+                               putnext(c);
+                               return(token_new2(_line, _col, "+"));
+                       }
+
+               case STATE_SUB: /* a '-' was read: emit "--", "-=" or plain "-" depending on c */
+                       switch(c) {
+                       case '-':
+                               return(token_new2(_line, _col, "--"));
+
+                       case '=':
+                               return(token_new2(_line, _col, "-="));
+
+                       default:
+                               putnext(c); /* push the lookahead back like STATE_ADD/STATE_MUL; putchar() printed and lost it */
+                               return(token_new2(_line, _col, "-"));
+                       }
+
+               case STATE_DOT: /* a '.' was read: start of a literal such as ".5", or a lone "." token */
+                       putnext(c); /* c is only lookahead here; the next state or call must read it again */
+                       if(c >= '0' && c <= '9') {
+                               tok = token_new_from_char(_line, _col, '.'); /* STATE_NUM appends to tok->value, so create it with the '.' */
+                               assert(tok);
+                               state = STATE_NUM;
+                       } else {
+                               return(token_new2(_line, _col, "."));
+                       }
+                       break;
+
+               case STATE_ZERO: /* tok holds "0": decide between a prefix "0x", more digits, "0." or a bare zero */
+                       if(c == 'x' || c == 'X') {
+                               str_appendc(tok->value, 'x'); /* both 'x' and 'X' are stored as lowercase 'x' */
+                               state = STATE_NUM;
+                       } else if(c >= '0' && c <= '9') {
+                               putnext(c); /* let STATE_NUM append the digit */
+                               state = STATE_NUM;
+                       } else if(c == '.') {
+                               str_appendc(tok->value, c); /* keep the '.' so "0.5" is not collapsed into "05" */
+                               state = STATE_NUM;
+                       } else {
+                               putnext(c); /* c belongs to the next token; the literal is just "0" */
+                               return(tok);
+                       }
+                       break;
+
+               case STATE_NUM:
+                       /* FIXME: e, E may be in the middle, (u|U)(l|L|ll|LL) or vice versa may be at the end */
+
+                       if(c >= '0' && c <= '9' || c == '.') {
+                               str_appendc(tok->value, c);
+                       } else {
+                               putnext(c);
+                               return(tok);
+                       }
+
+                       break;
                 }
         }
author	Matthias Kruk <m@m10k.eu>
	Sat, 30 May 2020 09:24:34 +0000 (18:24 +0900)
committer	Matthias Kruk <m@m10k.eu>
	Sat, 30 May 2020 09:24:34 +0000 (18:24 +0900)