shithub: mc

ref: 2ffd051d75978386e2c63b5d0812dd9be982c38c
dir: /lib/std/utf.myr/

View raw version
use "extremum"
use "chartype"
use "die"
use "types"

pkg std =
	/* Sentinel character value; charstep and decode return it when no
	   character can be decoded from the input. */
	const Badchar	: char	= -1
	/* Longest encoding, in bytes, that charlen reports for a character. */
	const Maxcharlen : size = 4
	/* Upper bound on character values (0x10FFFF). NOTE(review): nothing in
	   this file's visible code checks against it; charlen and charstep both
	   accept values up to 0x1FFFFF. */
	const Maxcharval : char = 0x10FFFF


	/* Number of bytes needed to encode chr. */
	const charlen	: (chr : char -> size)
	/* Writes the encoding of chr to the start of buf; returns the number of
	   bytes written, or -1 if buf is too short. */
	const encode	: (buf : byte[:], chr : char -> size)
	/* Decodes and returns the first character of buf. */
	const decode	: (buf : byte[:] -> char)
	/* Decodes the first character of str; returns it together with the
	   remainder of str. */
	const charstep	: (str : byte[:] -> (char, byte[:]))
	/* Splits str into a leading cluster of bytes and the remainder. */
	const graphemestep : (str : byte[:] -> (byte[:], byte[:]))

	/* Sums the number of display cells taken up by the characters of str. */
	const strcellwidth : (str : byte[:] -> size)
;;

/*
  Returns the number of bytes the UTF-8 encoding of `c` occupies.

  Values that need more than 21 bits cannot be represented in four
  bytes; for those 1 is returned so that a caller stepping through
  data can attempt to resync.
*/
const charlen = {c
	if c >= 0x200000
		/* out of range: attempt to resync */
		-> 1
	elif c >= 0x10000
		-> 4
	elif c >= 0x800
		-> 3
	elif c >= 0x80
		-> 2
	else
		-> 1
	;;
}

/*
  Encodes `c` as UTF-8 into the front of `buf`.

  Returns the number of bytes written (as computed by charlen), or -1
  when `buf` is shorter than that. Nothing is written on failure.
*/
const encode = {buf, c
	var n
	var lead
	var i

	n = charlen(c)
	if n < 0 || buf.len < n
		-> -1
	;;

	/*
	  A one byte sequence carries no length marker. For longer
	  sequences the marker is the first byte with its top `n` bits
	  set. NOTE(review): this relies on `<<` binding tighter than
	  `-`, i.e. ((1 << (8 - n)) - 1) ^ 0xff.
	*/
	lead = 0
	if n != 1
		lead = ((1 << (8 - n) - 1) ^ 0xff : char)
	;;

	/* emit the continuation bytes back to front, six payload bits each */
	i = n - 1
	while i > 0
		buf[i] = (c & 0x3f | 0x80 : byte)
		c >>= 6
		i--
	;;

	/* whatever is left of c goes in the first byte, under the marker */
	buf[0] = (c | lead : byte)
	-> n
}

/*
  Decodes the first character of `buf` via charstep and returns it,
  discarding the remainder of the buffer. The result is whatever
  charstep yields, including Badchar.
*/
const decode = {buf
	match charstep(buf)
	| (chr, _):	-> chr
	;;
}

/*
  Splits `str` into a leading cluster and the rest of the string.

  The cluster is built by decoding characters with charstep:
    - '\r', '\n' and '\t' are never joined onto a cluster: if one is
      the first character it is returned alone, otherwise the cluster
      collected so far is returned.
    - A character with positive cellwidth, or an undecodable byte
      (Badchar), ends a cluster that has already been started.
    - An undecodable byte at the start is returned as a one byte
      cluster.
    - Anything else is appended to the cluster.

  Returns (cluster, remainder). For an empty `str` both are empty.
*/
const graphemestep = {str
	var len : size = 0
	var rest = str
	var c
	var cn = 0

	while rest.len > 0
		(c, rest) = charstep(rest)
		cn = cellwidth(c)

		if (c == '\r' || c == '\n' || c == '\t')
			if len == 0
				/* these are all single byte characters */
				-> (str[:1], str[1:])
			else
				-> (str[:len], str[len:])
			;;
		elif (cn > 0 || c == Badchar) && len > 0
			-> (str[:len], str[len:])
		elif c == Badchar
			/* charstep skips exactly one byte for a bad character */
			-> (str[:1], str[1:])
		else
			/*
			  Record how many bytes charstep actually consumed.
			  Using charlen(c) here would be wrong for over-long
			  encodings, where the decoded value is shorter in
			  canonical form than the bytes it was read from, and
			  would split the string in the middle of a sequence.
			*/
			len = str.len - rest.len
		;;
	;;

	-> (str[:len], str[len:])
}

/*
  Decodes one UTF-8 encoded character from the front of `str`.

  Returns (character, remainder of str). On failure the character is
  Badchar:
    - empty input: the input is returned unchanged;
    - invalid lead byte, sequence longer than the input, or a
      malformed continuation byte: exactly one byte is skipped so the
      caller can try to resync on the next byte.

  Over-long encodings and values above Maxcharval are not rejected.
*/
const charstep = {str
	var len
	var mask
	var chr
	var c
	var tmp

	if str.len == 0
		/* empty string: no resync needed */
		-> (Badchar, str)
	;;
	c = str[0]
	len = 0
	if c & 0x80 == 0	/* 0b0xxx_xxxx */
		len = 1
	elif c & 0xe0 == 0xc0	/* 0b110x_xxxx */
		len = 2
	elif c & 0xf0 == 0xe0 	/* 0b1110_xxxx */
		len = 3
	elif c & 0xf8 == 0xf0 	/* 0b1111_0xxx */
		len = 4
	else
		/* skip one char forward so we can try
		   resyncing the character stream */
		-> (Badchar, str[1:])
	;;

	/* len is never 0 here (every branch above sets it or returns);
	   the check that matters is a sequence cut off by the end of str */
	if len == 0 || len > str.len
		/* again, we want to try to resync */
		-> (Badchar, str[1:])
	;;

	/* keep only the payload bits of the lead byte */
	mask = (1 << (8 - len)) - 1
	chr = (c : uint32) & mask
	for var i = 1; i < len; i++
		tmp = (str[i] : uint32)
		if tmp & 0xc0 != 0x80
			/* not a 0b10xx_xxxx continuation byte: the sequence
			   is broken, so drop only the lead byte and let the
			   caller resync on what follows instead of swallowing
			   bytes that may start a valid character */
			-> (Badchar, str[1:])
		;;
		chr = (chr << 6) | (tmp & 0x3f)
	;;

	-> ((chr : char), str[len:])
}

/*
  Adds up the number of cells the characters of `str` occupy.

  Each character decoded by charstep contributes:
    - 1 for an undecodable byte that is not the last thing in the
      string (it will probably be shown as U+FFFD);
    - 0 for control characters below 0x20 and for DEL (0x7f);
    - 1 for the remaining ASCII characters;
    - abs(cellwidth(c)) for everything else.
*/
const strcellwidth = {str
	var rest : byte[:] = str
	var chr : char = Badchar
	var cells : size = 0

	while rest.len > 0
		(chr, rest) = charstep(rest)
		if rest.len != 0 && chr == Badchar
			/* Something will probably be printed as U+FFFD */
			cells++
		elif chr >= 0x20 && chr < 0x7f
			/* Printable ASCII takes 1 cell; control characters
			   below 0x20 fall through and add nothing */
			cells++
		elif chr > 0x7f
			/* Not ASCII, so ask chartype what to do. DEL (0x7f)
			   is treated like a control character and skipped */
			cells += (abs(cellwidth(chr)) : size)
		;;
	;;

	-> cells
}