ref: 6095d59f1bcd58c59ad3ac4271172c7037c6bda0
parent: ea3c3bcded6031ba1f72e1af6683e0c6bcade96d
author: Ori Bernstein <[email protected]>
date: Sun Jul 29 20:44:43 EDT 2012
Add in more unicode stuff.
--- a/bld.sh
+++ b/bld.sh
@@ -46,6 +46,7 @@
die.myr \
alloc.myr\
str.myr \
+ fmt.myr \
chartype.myr"
OBJ="$(echo $ASM | sed 's/\.s/.o /g') $(echo $MYR | sed 's/\.myr/.o /g')"
@@ -65,4 +66,8 @@
echo $COMP
$COMP
+build f.myr
+COMP="$CC -m32 -o f f.o -L. -lstd"
+echo $COMP
+$COMP
--- a/chartype.myr
+++ b/chartype.myr
@@ -1036,11 +1036,12 @@
0x1ffc, 491 /* ῼ ῳ */
]
-const bsearch = {c, t, sz, nelt, ret
+const findc = {c, t, sz, nelt, ret
var l
var m
- while l.len > 1
+ l = t
+ while l.len > nelt
m = l.len/2
l = t[m+nelt, t.len]
if c >= l[0]
@@ -1059,18 +1060,16 @@
}
-const isalpha = {chr
+const isalpha = {c
var l
- var c
- c = chr castto(int)
- if isupper(chr) || islower(chr)
+ if isupper(c) || islower(c)
-> true
- elif bsearch(c, tabalpha2[0, tabalpha2.len], tabalpha2.len, 2, &l)
+ elif findc(c, tabalpha2[0, tabalpha2.len], tabalpha2.len, 2, &l)
if (c >= l[0] && c <= l[1])
-> true
;;
- elif bsearch(c, tabalpha1[0, tabalpha1.len], tabalpha1.len, 1, &l)
+ elif findc(c, tabalpha1[0, tabalpha1.len], tabalpha1.len, 1, &l)
if (c == l[0])
-> true
;;
@@ -1078,11 +1077,11 @@
-> false
}
-const isnum = {chr
+const isnum = {c
var l
var c
- if bsearch(c, tabisdigitr[0, tabisdigitr.len], tabisdigitr.len/2, 2, &l)
+ if findc(c, tabisdigitr[0, tabisdigitr.len], tabisdigitr.len/2, 2, &l)
if(c >= l[0] && c <= l[1])
-> true
;;
@@ -1089,14 +1088,20 @@
;;
-> false
}
-const isalnum = {chr
- -> isalpha(chr) || isnum(chr)
+
+/* Returns the result of isalpha(c) || isnum(c) for the character c. */
+const isalnum = {c
+ -> isalpha(c) || isnum(c)
 }
-const isspace = {chr
+
+const isspace = {c
var l
var c
+ var sl
+ var len
- if bsearch(c, tabspace2[0,tabspace2.len], tabspace2.len/2, 2, &l)
+ sl = tabspace2[0,tabspace2.len]
+ len = tabspace2.len/2
+ if findc(c, sl, len, 2, &l)
if(c >= l[0] && c <= l[1])
-> true
;;
@@ -1104,16 +1109,15 @@
-> false
}
-const islower = {chr
+const islower = {c
var l
var c
- c = chr castto(int)
- if bsearch(c, tabtoupper2[0, tabtoupper2.len], tabtoupper2.len, 2, &l)
+ if findc(c, tabtoupper2[0, tabtoupper2.len], tabtoupper2.len, 2, &l)
if (c >= l[0] && c <= l[1])
-> true
;;
- elif bsearch(c, tabtoupper1[0, tabtoupper1.len], tabtoupper1.len, 1, &l)
+ elif findc(c, tabtoupper1[0, tabtoupper1.len], tabtoupper1.len, 1, &l)
if (c == l[0])
-> true
;;
@@ -1121,15 +1125,15 @@
-> false
}
-const isupper = {chr
+const isupper = {c
var l
var c
- if bsearch(c, tabtolower2[0, tabtolower2.len], tabtolower2.len, 2, &l)
+ if findc(c, tabtolower2[0, tabtolower2.len], tabtolower2.len, 2, &l)
if (c >= l[0] && c <= l[1])
-> true
;;
- elif bsearch(c, tabtolower1[0, tabtolower1.len], tabtolower1.len, 1, &l)
+ elif findc(c, tabtolower1[0, tabtolower1.len], tabtolower1.len, 1, &l)
if (c == l[0])
-> true
;;
--- a/str.myr
+++ b/str.myr
@@ -5,10 +5,11 @@
pkg std =
const Badchar : char = -1 castto(char)
- const encode : (buf : byte[,], chr : char -> bool)
- const decode : (str : byte[,] -> char)
-
+ const charlen : (chr : char -> int)
+ const encode : (chr : char, buf : byte[,] -> bool)
+ const decode : (buf : byte[,] -> char)
const striter : (str : byte[,] -> [char, byte[,]])
+
const strjoin : (lst : byte[,][,], delim:byte[,] -> byte[,])
const strsep : (str : byte[,], delim:byte[,] -> byte[,][,])
const strbjoin : (lst : byte[,][,], delim:byte[,] -> byte[,])
@@ -15,6 +16,53 @@
const strbsep : (str : byte[,], delim:byte[,] -> byte[,][,])
;;
+/* Returns the number of bytes encode() writes for the character c:
+   1 when c < 0x80, 2 when c < 0x800, 3 when c < 0x10000,
+   4 when c < 0x200000, and -1 for any larger value.
+   NOTE(review): the 4-byte bound of 0x200000 is above Unicode's
+   maximum code point 0x10ffff; confirm whether accepting that
+   range is intended. */
+const charlen = {c
+ if c < 0x80
+ -> 1
+ elif c < 0x800
+ -> 2
+ elif c < 0x10000
+ -> 3
+ elif c < 0x200000
+ -> 4
+ else
+ /* no length is defined for values of 0x200000 and above */
+ -> -1
+ ;;
+}
+
+/* Writes the multi-byte encoding of the character c into the first
+   charlen(c) bytes of buf.
+   Returns false, leaving buf untouched, when charlen(c) is negative
+   or buf is shorter than the encoding; returns true otherwise. */
+const encode = {c, buf
+ var len
+ var mark
+ var i
+
+ len = charlen(c)
+ if len < 0 || buf.len < len
+ -> false
+ ;;
+
+ /* mark is the length tag or'd into the lead byte: 0 for a one-byte
+    encoding, otherwise the top len bits set (0xc0, 0xe0, 0xf0 for
+    len 2, 3, 4) */
+ if (len == 1)
+ mark = 0
+ else
+ mark = (((1 << (8 - len)) - 1) ^ 0xff) castto(char)
+ ;;
+
+ /* fill the trailing bytes from last to first; each takes the low
+    six bits of c combined with 0x80, then c is shifted down by six */
+ for i = len - 1; i > 0; i--
+ buf[i] = (c & 0x3f | 0x80) castto(byte)
+ c >>= 6
+ ;;
+
+ /* whatever is left of c goes into the lead byte along with mark */
+ buf[0] = (c | mark) castto(byte)
+ -> true
+}
+
+/* Decodes the first character of buf by calling striter(buf) and
+   returning the character half of its result. */
+const decode = {buf
+ var c
+ var b
+
+ /* b receives the slice half of striter's result and is not used */
+ (c, b) = striter(buf)
+ -> c
+}
+
const striter = {str
var len
var mask
@@ -23,7 +71,11 @@
var c
var tmp
+ if !str.len
+ -> (Badchar, str)
+ ;;
c = str[0]
+ len = 0
if c & 0x80 == 0 /* 0b0xxx_xxxx */
len = 1
elif c & 0xe0 == 0xc0 /* 0b110x_xxxx */
@@ -36,6 +88,10 @@
/* skip one char forward so we can try
resyncing the character stream */
-> (Badchar, str[1,str.len])
+ ;;
+
+ if len == 0 || len > str.len
+ -> (Badchar, str)
;;
mask = (1 << (7 - len)) - 1
--- a/test.myr
+++ b/test.myr
@@ -2,6 +2,7 @@
const main = {
var x : byte*[1024]
+ var buf : byte[1024]
var sz
var i
@@ -22,9 +23,9 @@
for i = 0; i < 1024; i++
std.free(x[i])
;;
- chartypes()
std.write(1, "Hello, 世界\n")
+ chartypes()
}
const chartypes = {
@@ -33,6 +34,12 @@
s = "世界 123\n"
for (c, s) = std.striter(s); s.len != 0; (c, s) = std.striter(s)
- c = c
+ if std.isspace(c)
+ std.write(1, "Space\n")
+ elif std.isalpha(c)
+ std.write(1, "Alpha\n")
+ elif std.isnum(c)
+ std.write(1, "Num\n")
+ ;;
;;
}