shithub: mc

Download patch

ref: 950ee74350cdb6b7ff170234fd9341f1b094d171
parent: 993d1af900057e362e08ea738e1f6f6b409c749e
author: Ori Bernstein <[email protected]>
date: Wed Dec 18 11:42:55 EST 2013

Support utf8 character values.

--- a/parse/gram.y
+++ b/parse/gram.y
@@ -626,7 +626,7 @@
 
 littok  : Tstrlit       {$$ = mkstr($1->line, $1->str);}
         | Tintlit       {$$ = mkint($1->line, $1->intval);}
-        | Tchrlit       {$$ = mkchar($1->line, *$1->str);} /* FIXME: expand escapes, unicode  */
+        | Tchrlit       {$$ = mkchar($1->line, $1->chrval);}
         | Tfloatlit     {$$ = mkfloat($1->line, $1->fltval);}
         | Tboollit      {$$ = mkbool($1->line, !strcmp($1->str, "true"));}
         ;
--- a/parse/parse.h
+++ b/parse/parse.h
@@ -86,6 +86,7 @@
     /* values parsed out */
     vlong intval;
     double fltval;
+    uint32_t chrval;
 };
 
 struct Stab {
--- a/parse/tok.c
+++ b/parse/tok.c
@@ -208,7 +208,7 @@
 {
     size_t i;
     char c;
-    
+
     i = 0;
     for (c = peek(); i < sz && identchar(c); c = peek()) {
         next();
@@ -332,36 +332,60 @@
     return t;
 }
 
+static uint32_t readutf(char c, char **buf, size_t *buflen, size_t *sz) {
+    size_t i, len;
+    uint32_t val;
+
+    if ((c & 0x80) == 0)
+        len = 1;
+    else if ((c & 0xe0) == 0xc0)
+        len = 2;
+    else if ((c & 0xf0) == 0xe0)
+        len = 3;
+    else if ((c & 0xf8) == 0xf0)
+        len = 4;
+
+    val = c & ((1 << (8 - len)) - 1);
+    append(buf, buflen, sz, c);
+    for (i = 1; i < len; i++) {
+        c = next();
+        if ((c & 0xc0) != 0x80)
+            fatal(line, "Invalid utf8 codepoint in character literal");
+        val = (val << 6) | (c & 0x3f);
+        append(buf, buflen, sz, c);
+    }
+    return val;
+}
+
 static Tok *charlit()
 {
     Tok *t;
     int c;
+    uint32_t val;
     size_t len, sz;
     char *buf;
 
+
     assert(next() == '\'');
 
     buf = NULL;
     len = 0;
     sz = 0;
-    while (1) {
-        c = next();
-        /* we don't unescape here, but on output */
-        if (c == '\'')
-            break;
-        else if (c == End)
-            fatal(line, "Unexpected EOF within char lit");
-        else if (c == '\n')
-            fatal(line, "Newlines not allowed in char lit");
-        else if (c == '\\')
-            decode(&buf, &len, &sz);
-        else
-            append(&buf, &len, &sz, c);
-
-    };
+    c = next();
+    if (c == End)
+        fatal(line, "Unexpected EOF within char lit");
+    else if (c == '\n')
+        fatal(line, "Newlines not allowed in char lit");
+    else if (c == '\\')
+        decode(&buf, &len, &sz);
+    else
+        val = readutf(c, &buf, &len, &sz);
     append(&buf, &len, &sz, '\0');
+    if (next() != '\'')
+        fatal(line, "Character constant with multiple characters");
 
     t = mktok(Tchrlit);
+    t->chrval = val;
     t->str = buf;
     return t;
 }