ref: 921fafdfaf1319d022fac101dd021fdd2eaea8cd
parent: 95d3a409233b94ff479d8b2529582d144fd172b3
parent: acd48d382119e8d1dbf58b827c809b61bf5d0ecd
author: Ori Bernstein <[email protected]>
date: Mon Oct 28 12:01:27 EDT 2013
Merge branch 'master' of git+ssh://git.eigenstate.org/git/ori/libregex Conflicts: compile.myr
--- a/compile.myr
+++ b/compile.myr
@@ -19,8 +19,7 @@
/* end matches */
`Byte byte
`Chr char
- `Class [char, char][:]
- `Dot
+ `Class [char, char]
/* meta */
`Cap tree#
@@ -41,9 +40,9 @@
re.pat = pat
re.nmatch = 1 /* whole match */
match parse(re)
- `None: -> `std.Failure (`Earlystop);;
- `Fail f: -> `std.Failure f;;
- `Some t:
+ | `None: -> `std.Failure (`Earlystop)
+ | `Fail f: -> `std.Failure f
+ | `Some t:
dump(t, 0)
append(re, `Ilbra 0)
gen(re, t)
@@ -51,7 +50,6 @@
append(re, `Imatch)
idump(re)
-> `std.Success re
- ;;
;;
-> `std.Failure (`Noimpl)
}
@@ -60,84 +58,88 @@
var m
match t#
- `Alt (a, b): genalt(re, a, b);;
- `Cat (a, b): gen(re, a); gen(re, b);;
+ |`Alt (a, b): genalt(re, a, b)
+ |`Cat (a, b): gen(re, a); gen(re, b)
/* repetition */
- `Star a: genstar(re, a);;
- `Plus a: gen(re, a); genstar(re, a);;
- `Quest a: genquest(re, a);;
+ |`Star a: genstar(re, a)
+ |`Plus a: gen(re, a); genstar(re, a)
+ |`Quest a: genquest(re, a)
/* end matches */
- `Class sl:
- std.die("Can't gen class\n")
- ;;
- `Byte b: append(re, `Ibyte b);;
- `Chr c: genchar(re, c);;
- `Dot:
- genutfrange(re, 0, std.Maxcharval)
- ;;
+ |`Byte b: append(re, `Ibyte b)
+ |`Chr c: genchar(re, c)
+ |`Class (a, b): genrange(re, a, b)
/* meta */
- `Bol:
- append(re, `Ibol)
- ;;
- `Eol:
- append(re, `Ibol)
- ;;
- `Cap a:
+ |`Bol: append(re, `Ibol)
+ |`Eol: append(re, `Ibol)
+ |`Cap a:
m = re.nmatch++
append(re, `Ilbra m)
gen(re, a)
append(re, `Irbra m)
- ;;
;;
-> re.proglen
}
-const genutfrange = {re, start, end
- var ranges = [
- (0,0x7f),
- (0x80,0x7ff),
- (0x800,0xffff),
- (0x10000,0x1FFFFF)
+const genrange = {re, lo, hi
+ var charrng = [
+ 0,
+ 0x80,
+ 0x800,
+ 0x10000,
+ 0x200000,
+ -1
]
- var startbuf : byte[4]
- var endbuf : byte[4]
- var szstart
- var szend
+ var lbuf : byte[4]
+ var hbuf : byte[4]
+ var lsz
+ var hsz
+ var end
+ var sz
+ var d
var i
var j
- var lo
- var hi
- szstart = std.charlen(start)
- szend = std.charlen(end)
- /*
- single byte characters can just be treated as a byte match, no
- need for branching.
- */
- if szstart == szend
- for i = 0; i < szstart; i++
- append(re, `Irange (startbuf[i], endbuf[i]))
- ;;
+ lsz = std.charlen(lo)
+ hsz = std.charlen(hi)
+ charrng[lsz - 1] = lo
+ charrng[hsz] = hi
+ if lsz == 1 && hsz == 1
+ append(re, `Irange (lo castto(byte), hi castto(byte)))
else
- for i = 0; i < (szend - szstart); i++
- append(re, `Ifork (i + 1, -1)) /* replace */
+ for i = hsz; i > lsz; i--
+ std.put("i = %z\n", i - 2)
+ d = re.proglen + i - 1
+ append(re, `Ifork (re.proglen + 1, jmpdist(i) + d))
;;
- for i = szstart; i < szend; i++
- (lo, hi) = ranges[i]
- lo = std.max(lo, start)
- hi = std.min(hi, end)
- std.encode(startbuf[:], start)
- std.encode(endbuf[:], end)
- for j = 0; j < i; j++
- append(re, `Irange (startbuf[i], endbuf[i]))
+ end = re.proglen + jmpdist(hsz + 1);
+ for i = 0; i < hsz; i++
+ std.put("lo[%z] = %i\n", i, charrng[i] castto(int))
+ std.put("hi[%z] = %i\n", i, (charrng[i + 1] - 1) castto(int))
+
+ sz = std.encode(lbuf[:], charrng[i])
+ std.encode(hbuf[:], charrng[i + 1] - 1)
+ for j = 0; j < sz; j++
+ append(re, `Irange (lbuf[j], hbuf[j]))
;;
- append(re, `Ijmp -1)
+ append(re, `Ijmp (end))
;;
;;
+ -> re.proglen
}
+const jmpdist = {n
+ var d
+ var i
+
+ d = n - 1
+ for i = n - 1; i > 0; i--
+ d += i
+ ;;
+ -> d
+}
+
const genalt = {re, l, r
var alt
var jmp
@@ -216,19 +218,19 @@
std.put("%i:\t", i)
match re.prog[i]
/* Char matching. Consume exactly one byte from the string. */
- `Ibyte b: std.put("`Ibyte %b (%c)\n", b, b castto(char)) ;;
- `Irange (start, end): std.put("`Irange (%b,%b)\n", start, end) ;;
- `Idot: std.put("`Idot\n") ;;
+ | `Ibyte b: std.put("`Ibyte %b (%c)\n", b, b castto(char))
+ | `Irange (start, end): std.put("`Irange (%b,%b)\n", start, end)
+ | `Idot: std.put("`Idot\n")
/* capture groups */
- `Ilbra m: std.put("`Ilbra %z\n", m) ;;
- `Irbra m: std.put("`Irbra %z\n", m) ;;
+ | `Ilbra m: std.put("`Ilbra %z\n", m)
+ | `Irbra m: std.put("`Irbra %z\n", m)
/* anchors */
- `Ibol: std.put("`Ibol\n");;
- `Ieol: std.put("`Ieol\n");;
+ | `Ibol: std.put("`Ibol\n")
+ | `Ieol: std.put("`Ieol\n")
/* control flow */
- `Ifork (lip, rip): std.put("`Ifork (%z,%z)\n", lip, rip) ;;
- `Ijmp ip: std.put("`Ijmp %z\n", ip) ;;
- `Imatch: std.put("`Imatch\n") ;;
+ | `Ifork (lip, rip): std.put("`Ifork (%z,%z)\n", lip, rip)
+ | `Ijmp ip: std.put("`Ijmp %z\n", ip)
+ | `Imatch: std.put("`Imatch\n")
;;
;;
}
@@ -240,60 +242,46 @@
std.put(" ")
;;
match t#
- `Alt (a, b):
+ | `Alt (a, b):
std.put("Alt\n")
dump(a, indent + 1)
dump(b, indent + 1)
- ;;
- `Cat (a, b):
+ | `Cat (a, b):
std.put("Cat\n")
dump(a, indent + 1)
dump(b, indent + 1)
- ;;
/* repetition */
- `Star a:
+ | `Star a:
std.put("Star\n")
dump(a, indent + 1)
- ;;
- `Plus a:
+ | `Plus a:
std.put("Plus\n")
dump(a, indent + 1)
- ;;
- `Quest a:
+ | `Quest a:
std.put("Quest\n")
dump(a, indent + 1)
- ;;
- `Bol:
+ | `Bol:
std.put("Bol\n")
- ;;
- `Eol:
+ | `Eol:
std.put("Eol\n")
- ;;
/* end matches */
- `Class sl:
- std.put("Class [a..b]\n")
- ;;
- `Byte b:
+ | `Byte b:
std.put("Byte %b\n", b)
- ;;
- `Chr c:
+ | `Chr c:
std.put("Char %c\n", c)
- ;;
- `Dot:
- std.put("Dot\n")
- ;;
+ | `Class (a, b):
+ std.put("Class (%c-%c)\n", a, b)
/* meta */
- `Cap a:
+ | `Cap a:
std.put("Cap\n")
dump(a, indent + 1)
- ;;
;;
}
const parse = {re
match altexpr(re)
- `Some t:
+ | `Some t:
if re.pat.len == 0
-> `Some t
else
@@ -300,10 +288,8 @@
free(t)
-> `Fail (`Earlystop)
;;
- ;;
- `None:
+ | `None:
-> `None
- ;;
;;
}
@@ -311,24 +297,21 @@
var ret : tree#
match catexpr(re)
- `Some t:
+ | `Some t:
ret = t
if matchc(re, '|')
match altexpr(re)
- `Some rhs:
+ | `Some rhs:
ret = mk(`Alt (ret, rhs))
- ;;
- `None:
+ | `None:
free(ret)
-> `Fail (`Earlystop)
- ;;
- `Fail f:
+ | `Fail f:
-> `Fail f
- ;;
;;
;;
- ;;
- other: -> other;;
+ | other:
+ -> other
;;
-> `Some ret
}
@@ -337,17 +320,16 @@
var ret
match repexpr(re)
- `Some t:
+ | `Some t:
ret = t
match catexpr(re)
- `Some rhs:
+ | `Some rhs:
ret = mk(`Cat (t, rhs))
- ;;
- `Fail f: -> `Fail f;;
- `None: /* nothing */;;
+ | `Fail f: -> `Fail f
+ | `None: /* nothing */
;;
- ;;
- other: -> other;;
+ | other:
+ -> other
;;
-> `Some ret
}
@@ -356,7 +338,7 @@
var ret
match baseexpr(re)
- `Some t:
+ | `Some t:
if matchc(re, '*')
ret = mk(`Star t)
elif matchc(re, '+')
@@ -366,8 +348,8 @@
else
ret = t
;;
- ;;
- other: -> other;;
+ | other:
+ -> other
;;
-> `Some ret
}
@@ -380,27 +362,26 @@
;;
match peekc(re)
/* lower prec operators */
- '|': -> `None;;
- ')': -> `None;;
- '*': -> `Fail (`Badrep);;
- '+': -> `Fail (`Badrep);;
- '?': -> `Fail (`Badrep);;
- '[': ret = chrclass(re);;
- '.': getc(re); ret = mk(`Dot);;
- '^': getc(re); ret = mk(`Bol);;
- '$': getc(re); ret = mk(`Eol);;
- '(':
+ | '|': -> `None
+ | ')': -> `None
+ | '*': -> `Fail (`Badrep)
+ | '+': -> `Fail (`Badrep)
+ | '?': -> `Fail (`Badrep)
+ | '[': -> chrclass(re)
+ | '.': getc(re); ret = mk(`Class (0, std.Maxcharval))
+ | '^': getc(re); ret = mk(`Bol)
+ | '$': getc(re); ret = mk(`Eol)
+ | '(':
getc(re)
match altexpr(re)
- `Some s: ret = mk(`Cap s);;
- `None: -> `Fail (`Emptyparen);;
+ | `Some s: ret = mk(`Cap s)
+ | `None: -> `Fail (`Emptyparen)
;;
if !matchc(re, ')')
free(ret)
-> `Fail (`Unbalanced)
;;
- ;;
- c:
+ | c:
getc(re)
if c == '\\'
if re.pat.len == 0
@@ -409,20 +390,42 @@
c = getc(re)
;;
ret = mk(`Chr c)
- ;;
;;
dump(ret, 0)
-> `Some ret
}
-const chrclass = {pat
+const chrclass = {re
+ var r
var t
- t = std.alloc()
- t# = `Class [][:]
- -> t
+ matchc(re, '[')
+ t = rangematch(re)
+ while peekc(re) != ']'
+ r = rangematch(re)
+ t = mk(`Alt (t, r))
+ ;;
+ if !matchc(re, ']')
+ free(t)
+ -> `Fail (`Earlystop)
+ else
+ -> `Some t
+ ;;
}
+const rangematch = {re
+ var lo
+ var hi
+
+ lo = getc(re)
+ if matchc(re, '-')
+ hi = getc(re)
+ -> mk(`Class (lo, hi))
+ else
+ -> mk(`Chr lo)
+ ;;
+}
+
const matchc = {re, c
var str
var chr
@@ -460,21 +463,20 @@
const free = {t
match t#
- `Alt (a, b): free(a); free(b);;
- `Cat (a, b): free(a); free(b);;
+ | `Alt (a, b): free(a); free(b)
+ | `Cat (a, b): free(a); free(b)
/* repetition */
- `Star a: free(a);;
- `Plus a: free(a);;
- `Quest a: free(a);;
+ | `Star a: free(a)
+ | `Plus a: free(a)
+ | `Quest a: free(a)
/* end matches */
- `Class sl: std.slfree(sl);;
- `Byte b: ;;
- `Chr c: ;;
- `Dot: ;;
+ | `Byte b:
+ | `Chr c:
+ | `Class (a, b):
/* meta */
- `Cap a: free(a);;
+ | `Cap a: free(a)
;;
std.free(t)
}
--- a/main.myr
+++ b/main.myr
@@ -3,9 +3,9 @@
const main = {
var found
- match regex.compile("b*\n^a*")
+ match regex.compile("[a-z0-9]*bc")
`std.Success re:
- found = regex.exec(re, "b\naaa")
+ found = regex.exec(re, "a123bc")
std.put("Found = %t: len = %z\n", found, re.strp)
-> 0
;;