ref: 950ee74350cdb6b7ff170234fd9341f1b094d171
parent: 993d1af900057e362e08ea738e1f6f6b409c749e
author: Ori Bernstein <[email protected]>
date: Wed Dec 18 11:42:55 EST 2013
Support utf8 character values.
--- a/parse/gram.y
+++ b/parse/gram.y
@@ -626,7 +626,7 @@
littok : Tstrlit {$$ = mkstr($1->line, $1->str);}
| Tintlit {$$ = mkint($1->line, $1->intval);}
- | Tchrlit {$$ = mkchar($1->line, *$1->str);} /* FIXME: expand escapes, unicode */
+ | Tchrlit {$$ = mkchar($1->line, $1->chrval);} /* chrval is stored on the token by charlit() in tok.c */
| Tfloatlit {$$ = mkfloat($1->line, $1->fltval);}
| Tboollit {$$ = mkbool($1->line, !strcmp($1->str, "true"));}
;
--- a/parse/parse.h
+++ b/parse/parse.h
@@ -86,6 +86,7 @@
/* values parsed out */
vlong intval;
double fltval;
+ uint32_t chrval;
};
struct Stab {
--- a/parse/tok.c
+++ b/parse/tok.c
@@ -208,7 +208,7 @@
{
size_t i;
char c;
-
+
i = 0;
for (c = peek(); i < sz && identchar(c); c = peek()) {
next();
@@ -332,36 +332,60 @@
return t;
}
+/*
+ * Decodes one UTF-8 encoded character. c is the first byte of the
+ * sequence, passed in by the caller; any continuation bytes are pulled
+ * from next(). Every byte of the sequence is also appended to the token
+ * text through append(buf, buflen, sz, ...).
+ *
+ * Returns the decoded codepoint. A byte that cannot start a sequence, or
+ * a continuation byte that is not of the form 10xxxxxx, is reported
+ * through fatal(). Overlong encodings, surrogates and values above
+ * 0x10ffff are not rejected.
+ */
+static uint32_t readutf(char c, char **buf, size_t *buflen, size_t *sz) {
+    size_t i, len;
+    uint32_t val;
+
+    /* the run of leading 1 bits in the first byte gives the sequence length */
+    if ((c & 0x80) == 0)
+        len = 1;
+    else if ((c & 0xe0) == 0xc0)
+        len = 2;
+    else if ((c & 0xf0) == 0xe0)
+        len = 3;
+    else if ((c & 0xf8) == 0xf0)
+        len = 4;
+    else {
+        /*
+         * Stray continuation byte (10xxxxxx) or 0xf8..0xff. Without this
+         * branch len stayed uninitialized and was then used as a shift
+         * count and as the loop bound below.
+         */
+        fatal(line, "Invalid utf8 codepoint in character literal");
+        return 0; /* only reached if fatal() returns; presumably it does not */
+    }
+
+    /* keep the low 8-len bits of the lead byte, dropping the length marker */
+    val = c & ((1 << (8 - len)) - 1);
+    append(buf, buflen, sz, c);
+    for (i = 1; i < len; i++) {
+        c = next();
+        if ((c & 0xc0) != 0x80)
+            fatal(line, "Invalid utf8 codepoint in character literal");
+        /* each continuation byte contributes its low six bits */
+        val = (val << 6) | (c & 0x3f);
+        append(buf, buflen, sz, c);
+    }
+    return val;
+}
+
+/*
+ * Lexes a character literal. Consumes the opening quote, exactly one
+ * character (a backslash escape handled by decode(), or a UTF-8 sequence
+ * handled by readutf()) and the closing quote. EOF, a raw newline, or
+ * anything other than a quote after that one character is fatal.
+ *
+ * Returns a Tchrlit token: str holds the NUL-terminated literal text and
+ * chrval holds the character value.
+ */
static Tok *charlit()
{
Tok *t;
int c;
+ uint32_t val;
size_t len, sz;
char *buf;
+
+ /* NOTE(review): next() is called inside assert(), so the opening quote is not consumed when built with NDEBUG */
assert(next() == '\'');
buf = NULL;
len = 0;
sz = 0;
- while (1) {
- c = next();
- /* we don't unescape here, but on output */
- if (c == '\'')
- break;
- else if (c == End)
- fatal(line, "Unexpected EOF within char lit");
- else if (c == '\n')
- fatal(line, "Newlines not allowed in char lit");
- else if (c == '\\')
- decode(&buf, &len, &sz);
- else
- append(&buf, &len, &sz, c);
-
- };
+ val = 0; /* defined value on every path, even if fatal() were to return */
+ c = next();
+ if (c == End)
+ fatal(line, "Unexpected EOF within char lit");
+ else if (c == '\n')
+ fatal(line, "Newlines not allowed in char lit");
+ else if (c == '\\')
+ decode(&buf, &len, &sz);
+ else
+ val = readutf(c, &buf, &len, &sz);
append(&buf, &len, &sz, '\0');
+ /*
+ * The result of decode() is not used above, so the escape path never
+ * assigned val and chrval ended up indeterminate. Fall back to the first
+ * byte of the text in buf, which is the value the grammar used to read
+ * from *str before chrval existed. This presumes decode() appends the
+ * unescaped bytes to buf -- TODO confirm. An escape that expands to a
+ * multi-byte sequence still yields only its first byte.
+ */
+ if (c == '\\')
+ val = (unsigned char)buf[0];
+ if (next() != '\'')
+ fatal(line, "Character constant with multiple characters");
t = mktok(Tchrlit);
+ t->chrval = val;
t->str = buf;
return t;
}