ref: 9f10f9d30d224a28da8fffb93ab53d338ecb6838
parent: 09a8efbc7e59523a6df366fffee7b207f3327c68
author: Ori Bernstein <[email protected]>
date: Fri Dec 27 09:07:13 EST 2013
Add support for \u{...} escape sequences.
--- a/parse/parse.h
+++ b/parse/parse.h
@@ -1,7 +1,6 @@
#define FATAL __attribute__((noreturn))
typedef uint8_t byte;
-typedef uint32_t unichar;
typedef unsigned int uint;
typedef unsigned long ulong;
typedef long long vlong;
@@ -196,7 +195,7 @@
union {
uvlong intval;
double fltval;
- unichar chrval;
+ uint32_t chrval;
char *strval;
char *lblval;
int boolval;
--- a/parse/tok.c
+++ b/parse/tok.c
@@ -230,10 +230,6 @@
return t;
}
-/*
- * Appends a character 'c' to a growable buffer 'buf',
- * resizing if needed.
- */
static void append(char **buf, size_t *len, size_t *sz, int c)
{
if (!*sz) {
@@ -248,7 +244,61 @@
buf[0][len[0]++] = c;
}
+/* Writes codepoint 'c' into buf[0..len-1] as a 'len'-byte UTF-8 sequence; 'len' must be 1..4. */
+static void encode(char *buf, size_t len, uint32_t c)
+{
+ int mark; /* length-marker bits OR'd into the lead byte */
+ size_t i;
+ /* 'len' is chosen by the caller; nothing here checks that 'c' fits in that many bytes */
+ assert(len > 0 && len < 5);
+ if (len == 1)
+ mark = 0; /* single byte: no marker bits are added */
+ else
+ mark = (((1 << (8 - len)) - 1) ^ 0xff); /* top 'len' bits set: 0xc0, 0xe0 or 0xf0 */
+ for (i = len - 1; i > 0; i--) { /* fill trailing bytes from last to first */
+ buf[i] = (c & 0x3f) | 0x80; /* low 6 bits, tagged 10xxxxxx */
+ c >>= 6;
+ }
+ buf[0] = (c | mark); /* remaining high bits plus the length marker */
+}
+
/*
+ * Appends the Unicode codepoint 'c', encoded as UTF-8, to the
+ * growable buffer 'buf', resizing if needed.
+ */
+static void appendc(char **buf, size_t *len, size_t *sz, uint32_t c)
+{
+ size_t i, charlen;
+ char charbuf[5] = {0}; /* holds up to 4 encoded bytes; zero-filled */
+ /* pick the encoded length from the magnitude of the codepoint */
+ if (c < 0x80)
+ charlen = 1;
+ else if (c < 0x800)
+ charlen = 2;
+ else if (c < 0x10000)
+ charlen = 3;
+ else if (c < 0x200000) /* NOTE(review): admits values above 0x10FFFF, which unichar() rejects - confirm intended */
+ charlen = 4;
+ else
+ fatal(line, "invalid utf character '\\u{%x}'", c); /* assumes fatal() does not return; otherwise charlen is unset below - TODO confirm */
+ /* encode into the scratch buffer, then push the bytes one at a time */
+ encode(charbuf, charlen, c);
+ for (i = 0; i < charlen; i++)
+ append(buf, len, sz, charbuf[i]);
+}
+/* Returns 1 if 'c' is a hex digit ('0'-'9', 'a'-'f' or 'A'-'F'), and 0 otherwise. */
+static int ishexval(char c)
+{
+ if (c >= 'a' && c <= 'f')
+ return 1;
+ else if (c >= 'A' && c <= 'F')
+ return 1;
+ else if (c >= '0' && c <= '9')
+ return 1;
+ return 0; /* anything else is not a hex digit */
+}
+
+/*
* Converts a character to its hex value.
*/
static int hexval(char c)
@@ -259,10 +309,31 @@
return c - 'A' + 10;
else if (c >= '0' && c <= '9')
return c - '0';
- die("passed non-hex value '%c' to hexval()", c);
+ fatal(line, "passed non-hex value '%c' to where hex was expected", c);
return -1;
}
+/* Parses the "{hex digits}" part of a \u{abc} escape and returns the codepoint value. */
+static int32_t unichar()
+{
+ uint32_t v; /* codepoint accumulated so far */
+ int c;
+
+ /* we've already seen the \u */
+ if (next() != '{')
+ fatal(line, "\\u escape sequence without initial '{'");
+ v = 0;
+ while (ishexval(peek())) { /* consume characters for as long as the next one is a hex digit */
+ c = next();
+ v = 16*v + hexval(c); /* shift in one more base-16 digit */
+ if (v > 0x10FFFF) /* checked after every digit, so 16*v cannot overflow on the next pass */
+ fatal(line, "invalid codepoint for \\u escape sequence");
+ } /* NOTE(review): with no digits at all ("\u{}") the loop is skipped and v stays 0 - confirm intended */
+ if (next() != '}')
+ fatal(line, "\\u escape sequence without ending '}'");
+ return v; /* v <= 0x10FFFF here, so the uint32_t to int32_t conversion keeps the value */
+}
+
/*
* decodes an escape code. These are
* shared between strings and characters.
@@ -271,7 +342,7 @@
static int decode(char **buf, size_t *len, size_t *sz)
{
char c, c1, c2;
- int v;
+ int32_t v;
c = next();
/* we've already seen the '\' */
@@ -285,6 +356,7 @@
fatal(line, "expected hex digit, got %c", c1);
v = 16*hexval(c1) + hexval(c2);
break;
+ case 'u': v = unichar(); break;
case 'n': v = '\n'; break;
case 'r': v = '\r'; break;
case 't': v = '\t'; break;
@@ -296,7 +368,7 @@
case '0': v = '\0'; break;
default: fatal(line, "unknown escape code \\%c", c);
}
- append(buf, len, sz, v);
+ appendc(buf, len, sz, v);
return v;
}
--- a/test/tests
+++ b/test/tests
@@ -26,6 +26,7 @@
B div E 42
B mod E 6
B bsr E 5
+B chartest E 0
B trunccast E 15
B zwidencast E 99
B swidencast E 99