ref: 346140113cb979d60f27fb3795f08e5976c0c712
parent: 3f5de2514eb4a90a8f7925bbeaf6dfdfb07debf8
author: Ori Bernstein <[email protected]>
date: Wed Dec 26 16:43:45 EST 2018
Implement `std.bygrapheme`.
--- a/lib/std/striter.myr
+++ b/lib/std/striter.myr
@@ -4,6 +4,8 @@
use "strfind"
use "option"
use "chartype"
+use "slpush"
+use "alloc"
pkg std =
type chariter = struct
@@ -10,6 +12,10 @@
rest : byte[:]
;;
+ type graphemeiter = struct
+ rest : byte[:]
+ ;;
+
type charoffiter = struct
str : byte[:]
idx : size
@@ -25,17 +31,24 @@
idx : size
;;
- impl iterable chariter -> char
+ impl iterable chariter -> char
+ impl iterable graphemeiter -> char[:]
impl iterable charoffiter -> (char, size)
- impl iterable splititer -> byte[:]
- impl iterable tokiter -> byte[:]
+ impl iterable splititer -> byte[:]
+ impl iterable tokiter -> byte[:]
- const bychar : (str : byte[:] -> chariter)
- const bycharoff : (str : byte[:] -> charoffiter)
- const bysplit : (str : byte[:], split : byte[:] -> splititer)
- const bytok : (str : byte[:] -> tokiter)
+ const bychar : (str : byte[:] -> chariter)
+ const bygrapheme : (str : byte[:] -> graphemeiter)
+ const bycharoff : (str : byte[:] -> charoffiter)
+ const bysplit : (str : byte[:], split : byte[:] -> splititer)
+ const bytok : (str : byte[:] -> tokiter)
;;
+/*
+ * Iterate through a string char by char,
+ * decoding the utf8 bytes into a single
+ * codepoint.
+ */
impl iterable chariter -> char =
__iternext__ = {ci, c
if ci.rest.len == 0
@@ -53,7 +66,50 @@
-> [.rest = str]
}
+/*
+ * Iterate through a string grapheme by grapheme,
+ * returning a slice of characters composing the
+ * grapheme.
+ */
+impl iterable graphemeiter -> char[:] =
+ __iternext__ = {ci, g : char[:]#
+ var gb, gc : char[:]
+ if ci.rest.len == 0
+ -> false
+ ;;
+ (gb, ci.rest) = graphemestep(ci.rest)
+ /*
+ * Graphemestep returns bytes, but we
+ * want a slice of chars.
+ */
+ gc = [][:]
+ for c : std.bychar(gb)
+ std.slpush(&gc, c)
+ ;;
+ g# = gc
+ -> true
+ }
+
+ __iterfin__ = {ci, g
+ std.slfree(g#)
+ }
+;;
+
+const bygrapheme = {str
+ -> [.rest = str]
+}
+
+
+/*
+ * Iterates through a string character by
+ * character, similar to chariter, but returns
+ * the offset into the string of the codepoint.
+ * For example,
+ * "ὐbὐc"
+ * would return the sequence:
+ * (ὐ, 0), (b, 3), (ὐ, 4), (c, 7)
+ */
impl iterable charoffiter -> (char, size) =
__iternext__ = {ci, cv
var c
@@ -75,6 +131,10 @@
-> [.str=s, .idx=0]
}
+/*
+ * Iterates through the splits of a string by a
+ * delimiter, skipping the delimiter.
+ */
impl iterable splititer -> byte[:] =
__iternext__ = {si, sp
match std.strfind(si.rest, si.split)
@@ -100,6 +160,10 @@
-> [.rest = str, .split = split]
}
+/*
+ * Tokenizes a string by spaces, iterating over
+ * the results.
+ */
impl iterable tokiter -> byte[:] =
__iternext__ = {it, sp
var s, lo, hi, c
--- a/lib/std/test/striter.myr
+++ b/lib/std/test/striter.myr
@@ -3,6 +3,18 @@
const main = {
var chars = ['a', 'b', 'c']
var splits = ["foo", "+bar"]
+ var graphemes = [
+ [0x300][:],
+ [0x61][:],
+ [0x53f2][:],
+ [0x63][:],
+ [0x9][:],
+ [0x42f][:],
+ [0x78, 0x300, 0x300, 0x300, 0x300, 0x300][:],
+ [0xa][:],
+ [0x7a, 0x309][:]
+ ]
+
var i
i = 0
@@ -16,4 +28,10 @@
std.assert(std.eq(splits[i++], sp), "wrong split {}", sp)
;;
std.assert(i == splits.len, "wrong split count")
+
+ i = 0
+ for g : std.bygrapheme("̀a史c\tЯx̀̀̀̀̀\nz̉")
+ std.assert(std.eq(g, graphemes[i++]), "mismatched grapheme cluster\n")
+ ;;
+ std.assert(i == graphemes.len, "wrong grapheme set length")
}