shithub: mc

Download patch

ref: 363389e2b3fc56ad52922dc1903ca96129a3f5ed
parent: a2da7312d03152baea7311d4e0263f42fbc9a2e6
author: Ori Bernstein <[email protected]>
date: Tue Oct 22 10:30:39 EDT 2013

Add compilation of regexes.

--- a/compile.myr
+++ b/compile.myr
@@ -3,20 +3,415 @@
 use "types.use"
 
 pkg regex =
-	const compile	: (re : byte[:] -> regex#)
+	const compile	: (re : byte[:] -> std.error(regex#, status))
 ;;
 
-const compile = {re
-	var re
+type tree = union
+	/* basic string building */
+	`Alt	[tree#, tree#]
+	`Cat	[tree#, tree#]
+
+	/* repetition */
+	`Star	tree#
+	`Plus	tree#
+	`Quest	tree#	
+
+	/* end matches */
+	`Byte	byte
+	`Chr	char
+	`Class	[char, char][:]
+	`Dot
+
+	/* meta */
+	`Cap	tree#
+	`Astart
+	`Aend
+;;
+
+type parseresult = union
+	`Some tree#
+	`None
+	`Fail status
+;;
+
+const compile = {pat
+	var re : regex#
+
 	re = std.zalloc()
-	re.prog = std.slalloc(4)
-	/* compiled regex for a* */
-	re.prog[0] = `Byte ('a' castto(byte))
-	re.prog[1] = `Byte ('a' castto(byte))
-	re.prog[2] = `Split (0, 3)
-	re.prog[3] = `Match
+	re.pat = pat
+	match parse(re)
+	`None:		std.put("Empty parse\n");;
+	`Fail f:	-> `std.Failure f;;
+	`Some t:
+		dump(t, 0)
+		gen(re, t)
+		append(re, `Imatch)
+		idump(re)
+		;;
+	;;
+	-> `std.Failure (`Noimpl)
+}
 
-	-> re
+const gen = {re, t
+	match t#
+	`Alt	(a, b): genalt(re, a, b);;
+	`Cat	(a, b): gen(re, a); gen(re, b);;
+	/* repetition */
+	`Star	a:	genstar(re, a);;
+	`Plus	a:	gen(re, a); genstar(re, a);;
+	`Quest	a:	genquest(re, a);;
+
+	/* end matches */
+	`Class	sl:
+		std.die("Can't gen class\n")
+		;;
+	`Byte	b: 	append(re, `Ibyte b);;
+	`Chr	c:	genchar(re, c);;
+	`Dot: 		append(re, `Idot);;
+
+	/* meta */
+	`Cap	a:
+		std.put("WARNING: Capture not implemented")
+		gen(re, a)
+		;;
+	;;
+	-> re.proglen
 }
 
+const genalt = {re, l, r
+	var alt
+	var jmp
+	var l0
+	var l1
+	var l2
 
+	alt 	= re.proglen
+	l0	= append(re, `Ifork (-1, -1)) /* needs to be replaced */
+		  gen(re, l)
+	jmp	= re.proglen
+	l1 	= append(re, `Ijmp -1) /* needs to be replaced */
+	l2	= gen(re, l)
+
+	re.prog[alt] = `Ifork(l0, l1)
+	re.prog[jmp] = `Ijmp l2
+	-> re.proglen
+}
+
+const genstar = {re, rep
+	var alt
+	var jmp
+	var l0
+	var l1
+	var l2
+
+	l0 	= re.proglen
+	alt	= re.proglen
+	l1 	= append(re, `Ifork (-1, -1)) /* needs to be replaced */
+	jmp	= gen(re, rep)
+	l2	= append(re, `Ijmp -1)
+
+
+	re.prog[alt] = `Ifork (l1, l2)
+	re.prog[jmp] = `Ijmp l0
+	-> re.proglen
+}
+
+const genquest = {re, q
+	var alt
+	var l0
+	var l1
+
+	alt	= re.proglen
+	l0	= append(re, `Ifork (-1, -1)) /* needs to be replaced */
+	l1	= gen(re, q)
+	re.prog[alt] = `Ifork (l0, l1)
+	-> re.proglen
+}
+
+const genchar = {re, c
+	var b : byte[4]
+	var n
+	var i
+
+	n = std.encode(b[:], c)
+	for i = 0; i < n; i++
+		append(re, `Ibyte b[i])
+	;;
+	-> re.proglen
+}
+
+const append = {re, insn
+	if re.proglen == re.prog.len
+		re.prog = std.slgrow(re.prog, std.max(1, 2*re.proglen))
+	;;
+	re.prog[re.proglen] = insn
+	re.proglen++
+	-> re.proglen
+}
+
+const idump = {re
+	var i
+
+	for i = 0; i < re.proglen; i++
+		std.put("%i:\t", i)
+		match re.prog[i]
+		/* Char matching. Consume exactly one byte from the string. */
+		`Ibyte b: 
+			std.put("`Ibyte %b (%c)\n", b, b castto(char))
+			;;
+		`Irange (start, end):
+			std.put("`Irange (%b,%b)\n", start, end)
+			;;
+		`Idot:
+			std.put("`Idot\n")
+			;;
+		/*
+		Control flow. All of these recursively call step() until
+		exactly one byte is consumed from the string.
+		*/
+		`Ifork	(lip, rip):
+			std.put("`Ifork (%z,%z)\n", lip, rip)
+			;;
+		`Ijmp ip:
+			std.put("`Ijmp %z\n", ip)
+			;;
+		`Imatch:
+			std.put("`Imatch\n")
+			;;
+		;;
+	;;
+}
+
+const dump = {t, indent
+	var i
+
+	for i = 0; i < indent; i++
+		std.put(" ")
+	;;
+	match t#
+	`Alt	(a, b):
+		std.put("Alt\n")
+		dump(a, indent + 1)
+		dump(b, indent + 1)
+		;;
+	`Cat	(a, b):
+		std.put("Cat\n")
+		dump(a, indent + 1)
+		dump(b, indent + 1)
+		;;
+	/* repetition */
+	`Star	a:
+		std.put("Star\n")
+		dump(a, indent + 1)
+		;;
+	`Plus	a:
+		std.put("Plus\n")
+		dump(a, indent + 1)
+		;;
+	`Quest	a:
+		std.put("Quest\n")
+		dump(a, indent + 1)
+		;;
+	/* end matches */
+	`Class	sl:
+		std.put("Class [a..b]\n")
+		;;
+	`Byte	b:
+		std.put("Byte %b\n", b)
+		;;
+	`Chr	c:
+		std.put("Char %c\n", c)
+		;;
+	`Dot:
+		std.put("Dot\n")
+		;;
+
+	/* meta */
+	`Cap	a:
+		std.put("Cap\n")
+		dump(a, indent + 1)
+		;;
+	;;
+}
+
+const parse = {re
+	match altexpr(re)
+	`Some t:
+		if re.pat.len == 0
+			-> `Some t
+		else
+			free(t)
+			-> `Fail (`Earlystop)
+		;;
+		;;
+	`None:
+		-> `None
+		;;
+	;;
+}
+
+const altexpr = {re
+	var ret : tree#
+
+	std.put("-> alt\n")
+	match catexpr(re)
+	`Some t:
+		ret = t
+		if matchc(re, '|')
+			match altexpr(re)
+			`Some rhs:
+				ret = mk(`Alt (ret, rhs))
+				;;
+			`None:
+				free(ret)
+				-> `Fail (`Earlystop)
+				;;
+			`Fail f:
+				-> `Fail f
+				;;
+			;;
+
+		;;
+		;;
+	other:	-> other;;
+	;;
+	-> `Some ret
+}
+
+const catexpr = {re
+	var ret
+
+	std.put("-> cat\n")
+	match repexpr(re)
+	`Some t: 
+		ret = t
+		match catexpr(re)
+		`Some rhs:
+			ret = mk(`Cat (t, rhs))
+			;;
+		`Fail f: -> `Fail f;;
+		`None:	/* nothing */;;
+		;;
+		;;
+	other:	-> other;;
+	;;
+	-> `Some ret
+}
+
+const repexpr = {re
+	var ret
+
+	std.put("-> rep\n")
+	match baseexpr(re)
+	`Some t:
+		if matchc(re, '*')
+			ret = mk(`Star t)
+		elif matchc(re, '+')
+			ret = mk(`Plus t)
+		elif matchc(re, '?')
+			ret = mk(`Quest t)
+		else
+			ret = t
+		;;
+		;;
+	other:	-> other;;
+	;;
+	-> `Some ret
+}
+
+const baseexpr = {re
+	var ret
+
+	std.put("-> base\n")
+	if re.pat.len == 0
+		-> `None
+	;;
+	match getc(re)
+	'[':	ret = chrclass(re);;
+	'.':	ret = mk(`Dot);;
+	'^':	ret = mk(`Astart);;
+	'$':	ret = mk(`Aend);;
+	'(':	
+		match altexpr(re)
+		`Some s:	ret = mk(`Cap s);;
+		`None:	-> `Fail (`Emptyparen);;
+		;;
+		if !matchc(re, ')')
+			free(ret)
+			-> `Fail (`Unbalanced)
+		;;
+		;;
+	'*':	-> `Fail (`Badrep);;
+	'+':	-> `Fail (`Badrep);;
+	'?':	-> `Fail (`Badrep);;
+	c:
+		if c == '\\'
+			if re.pat.len == 0
+				-> `Fail (`Earlystop)
+			;;
+			c = getc(re)
+		;;
+		ret = mk(`Chr c)
+		;;
+	;;
+	std.put("<- base\n")
+	dump(ret, 0)
+	-> `Some ret
+}
+
+const chrclass = {pat
+	var t
+
+	t = std.alloc()
+	t# = `Class [][:]
+	-> t
+}
+
+const matchc = {re, c
+	var str
+	var chr
+
+	(chr, str) = std.striter(re.pat)
+	if chr != c
+		-> false
+	;;
+	re.pat = str
+	-> true
+}
+
+const getc = {re
+	var c
+
+	(c, re.pat) = std.striter(re.pat)
+	-> c
+}
+		
+
+const mk = {v
+	var t
+
+	t = std.alloc()
+	t# = v
+	-> t
+}
+
+const free = {t
+	match t#
+	`Alt	(a, b): free(a); free(b);;
+	`Cat	(a, b): free(a); free(b);;
+	/* repetition */
+	`Star	a:	free(a);;
+	`Plus	a:	free(a);;
+	`Quest	a:	free(a);;
+
+	/* end matches */
+	`Class	sl:	std.slfree(sl);;
+	`Byte	b:	;;
+	`Chr	c:	;;
+	`Dot:		;;
+
+	/* meta */
+	`Cap	a:	free(a);;
+	;;
+	std.free(t)
+}
--- a/interp.myr
+++ b/interp.myr
@@ -35,33 +35,36 @@
 	thr = re.thr[tid]
 	str = re.str
 	match re.prog[thr.ip]
-	/* Char matching */
-	`Byte b:
+	/* Char matching. Consume exactly one byte from the string. */
+	`Ibyte b:
 		if !in(re, str) || b != str[re.strp]
-			kill(re, tid, "not char")
+			kill(re, tid, "not right char")
 		else
 			std.put("matched %b with %b\n", b, str[re.strp])
 		;;
 		;;
-	`Range (start, end):
+	`Irange (start, end):
 		if !in(re, str) || start > str[re.strp] || end < str[re.strp]
 			kill(re, tid, "bad range")
 		;;
 		;;
-	`Dot:
+	`Idot:
 		if !in(re, str)
 			kill(re, tid, "past end")
 		;;
 		;;
-	/* Control flow */
-	`Split	(lip, rip):
+	/*
+	  Control flow. All of these recursively call step() until
+	  exactly one byte is consumed from the string.
+	 */
+	`Ifork	(lip, rip):
 		spawn(re, lip)
 		jmp(re, tid, rip)
 		;;
-	`Jmp ip:
+	`Ijmp ip:
 		jmp(re, tid, ip)
 		;;
-	`Match:
+	`Imatch:
 		finish(re, tid)
 		;;
 	;;
@@ -91,7 +94,6 @@
 
 	re.thr[re.nthr] = thr
 	re.nthr++
-
 	step(re, tid)
 }
 
--- a/main.myr
+++ b/main.myr
@@ -1,13 +1,13 @@
+use regex
 use std
 
-use regex
-
 const main = {
+	regex.compile("a+")
+	/*
 	var re
 	var found
-
-	re = regex.compile("a+")
 	found = regex.exec(re, "aaa")
 	std.put("Found: len = %z\n", re.strp, found)
+	*/
 }
 
--- a/types.myr
+++ b/types.myr
@@ -1,7 +1,18 @@
 use std
 
 pkg regex =
+	type status = union
+		`Earlystop
+		`Unbalanced
+		`Emptyparen
+		`Badrep
+		`Noimpl
+	;;
+
 	type regex = struct
+		pat	: byte[:]
+		/* VM state */
+		proglen	: std.size
 		prog	: reinst[:]
 		nthr	: std.size
 		thr	: rethread#[:]
@@ -17,13 +28,13 @@
 
 	type reinst = union
 		/* direct consumers */
-		`Byte	byte
-		`Range	[byte, byte]
-		`Dot
+		`Ibyte	byte
+		`Irange	[byte, byte]
+		`Idot
 
-		`Match	/* found the end of the expr */
+		`Imatch	/* found the end of the expr */
 
-		`Split	[std.size, std.size]
-		`Jmp	std.size
+		`Ifork	[std.size, std.size]
+		`Ijmp	std.size
 	;;
 ;;