git.corax.cc Git - ccc/commitdiff
tokenize: Implement correct tokenization of comments and operators starting with...
author: Matthias Kruk <m@m10k.eu>
Thu, 28 May 2020 00:27:28 +0000 (09:27 +0900)
committer: Matthias Kruk <m@m10k.eu>
Thu, 28 May 2020 00:27:28 +0000 (09:27 +0900)
src/tokenize.c

index 73fe060a3b78ef41b55464bb36eec9904eb79efa..2f6841df71a6c7c4621239c3994a8c74686d1561 100644 (file)
@@ -6,11 +6,16 @@
 #include "list.h"
 #include "token.h"
 
-#define TABWIDTH      8
-#define STATE_NONE    0
-#define STATE_COMMENT 1
-#define STATE_OP      2
-#define STATE_DIV     3
+#define TABWIDTH          8
+
+#define STATE_NONE        0
+#define STATE_COMMENT     1
+#define STATE_COMMENT_END 2
+#define STATE_DIV         3
+#define STATE_LT          4
+#define STATE_SHL         5
+#define STATE_GT          6
+#define STATE_SHR         7
 
 #define STATE_DONE    8
 
@@ -79,167 +84,16 @@ int putnext(const char c)
        return(ret_val);
 }
 
-#if 0
-int comment(void)
-{
-       struct token *tok;
-       int cstate;
-       char c;
-
-#define CSTATE_HEAD 0
-#define CSTATE_BODY 1
-#define CSTATE_FOOT 2
-#define CSTATE_DONE 3
-
-       cstate = CSTATE_HEAD;
-
-       tok = token_new(TOKEN_COMMENT);
-       assert(tok);
-
-       token_setpos(tok, _line, _col);
-
-       str_appendc(tok->value, '/');
-
-       while(cstate != CSTATE_DONE) {
-               c = getnext();
-
-               str_appendc(tok->value, c);
-
-               switch(cstate) {
-               case CSTATE_HEAD:
-                       assert(c == '*');
-                       cstate = CSTATE_BODY;
-                       break;
-
-               case CSTATE_BODY:
-                       if(c == '*') {
-                               cstate = CSTATE_FOOT;
-                       }
-                       break;
-
-               case CSTATE_FOOT:
-                       if(c == '/') {
-                               cstate = CSTATE_DONE;
-                       } else {
-                               cstate = CSTATE_BODY;
-                       }
-
-                       break;
-
-               default:
-                       assert(0);
-                       break;
-               }
-       }
-
-       list_append(&tokens, tok);
-
-       return(0);
-}
-
-int div(void)
-{
-       char c;
-
-       c = getnext();
-
-int none(void)
-{
-       char c;
-       int ret_val;
-       char lookahead;
-
-       c = getnext();
-
-       if(!c) {
-               return(STATE_DONE);
-       }
-
-       switch(c) {
-       case ' ':
-       case '\t':
-       case '\n':
-               ret_val = STATE_NONE;
-               break;
-
-       case '/':
-               lookahead = getnext();
-
-               if(lookahead == '*') {
-                       ret_val = STATE_COMMENT;
-               } else {
-                       ret_val = STATE_DIV;
-               }
-
-               putnext(lookahead);
-
-               break;
-
-       default:
-               ret_val = STATE_NONE;
-               break;
-/*
-       case '"':
-       case '\'':
-       case '0':
-       case '1':
-       case '2':
-       case '3':
-       case '4':
-       case '5':
-       case '6':
-       case '7':
-       case '8':
-       case '9':
-               ret_val = STATE_LITERAL;
-               break;
-               */
-/*
-       case '+':
-       case '-':
-       case '*':
-       case '=':
-               ret_val = STATE_OP;
-               break;
-*/
-/*
-       case '<':
-       case '>':
-       case '?':
-       case ',':
-       case ';':
-       case ':':
-       case '!':
-       case '%':
-       case '&':
-       case '(':
-       case ')':
-       case '~':
-       case '^':
-       case '|':
-       case '[':
-       case ']':
-       case '{':
-       case '}':
-*/
-       }
-
-       return(ret_val);
-}
-#endif /* 0 */
-
 struct token *nexttoken(void)
 {
        struct token *tok;
-
        int state;
+       char c;
 
        state = STATE_NONE;
        tok = NULL;
 
-       while(1) {
-               char c = getnext();
-
+       while((c = getnext())) {
                switch(state) {
                case STATE_NONE:
                        switch(c) {
@@ -247,6 +101,14 @@ struct token *nexttoken(void)
                                state = STATE_DIV;
                                break;
 
+                       case '<':
+                               state = STATE_LT;
+                               break;
+
+                       case '>':
+                               state = STATE_GT;
+                               break;
+
                        case '\r':
                                fprintf(stderr, "DOS user, eh?\n");
                        case ' ':
@@ -256,7 +118,7 @@ struct token *nexttoken(void)
 
                        default:
                                /* unrecognized token */
-                               fprintf(stderr, "Unrecognized token at %d:%d\n", _line, _col);
+                               fprintf(stderr, "Unrecognized token at %d:%d [%02x]\n", _line, _col, c);
                                return(NULL);
                        }
 
@@ -270,7 +132,6 @@ struct token *nexttoken(void)
 
                        case '=':
                                return(token_new2(_line, _col, "/="));
-                               break;
 
                        default:
                                putnext(c);
@@ -287,11 +148,71 @@ struct token *nexttoken(void)
 
                        str_appendc(tok->value, c);
 
-                       if(c == '"') {
+                       if(c == '*') {
+                               state = STATE_COMMENT_END;
+                       }
+
+                       break;
+
+               case STATE_COMMENT_END:
+                       str_appendc(tok->value, c);
+
+                       if(c == '/') {
                                return(tok);
                        }
 
+                       state = STATE_COMMENT;
+                       break;
+
+               case STATE_LT:
+                       switch(c) {
+                       case '<':
+                               state = STATE_SHL;
+                               break;
+
+                       case '=':
+                               return(token_new2(_line, _col, "<="));
+                               break;
+
+                       default:
+                               putnext(c);
+                               return(token_new2(_line, _col, "<"));
+                       }
+
                        break;
+
+               case STATE_SHL:
+                       if(c == '=') {
+                               return(token_new2(_line, _col, "<<="));
+                       }
+
+                       putnext(c);
+                       return(token_new2(_line, _col, "<<"));
+
+               case STATE_GT:
+                       switch(c) {
+                       case '>':
+                               state = STATE_SHR;
+                               break;
+
+                       case '=':
+                               return(token_new2(_line, _col, ">="));
+
+                       default:
+                               putnext(c);
+                               return(token_new2(_line, _col, ">"));
+                       }
+
+                       break;
+
+               case STATE_SHR:
+                       if(c == '=') {
+                               return(token_new2(_line, _col, ">>="));
+                       }
+
+                       putnext(c);
+                       return(token_new2(_line, _col, ">>"));
+
                }
        }