ref: 1f7f58d6a483853714d65099f32a916a92b18197
parent: 4db4f4592a0afaaeb5a9d4217359776c7139e949
author: Ori Bernstein <[email protected]>
date: Fri Jan 24 20:08:54 EST 2014
Add unusably slow unicode character classes. So slow that I can't even reasonably add a test. Ugh. Need to optimize it.
--- a/compile.myr
+++ b/compile.myr
@@ -439,12 +439,11 @@
if re.pat.len == 0
-> `Fail (`Earlystop)
;;
- ret = escaped(re)
+ -> escaped(re)
| c:
getc(re)
ret = mk(`Chr c)
;;
- dump(re, ret, 0)
-> `Some ret
}
@@ -453,30 +452,81 @@
match getc(re)
/* character classes */
- | 'd': ret = ranges(re, _ranges.tabasciidigit[:])
- | 'x': ret = ranges(re, _ranges.tabasciixdigit[:])
- | 's': ret = ranges(re, _ranges.tabasciispace[:])
- | 'w': ret = ranges(re, _ranges.tabasciiword[:])
- | 'h': ret = ranges(re, _ranges.tabasciiblank[:])
+ | 'd': ret = `Some ranges(re, _ranges.tabasciidigit[:])
+ | 'x': ret = `Some ranges(re, _ranges.tabasciixdigit[:])
+ | 's': ret = `Some ranges(re, _ranges.tabasciispace[:])
+ | 'w': ret = `Some ranges(re, _ranges.tabasciiword[:])
+ | 'h': ret = `Some ranges(re, _ranges.tabasciiblank[:])
- /* escaped metachars */
- | '^': ret = mk(`Chr '^')
- | '$': ret = mk(`Chr '$')
- | '.': ret = mk(`Chr '.')
- | '+': ret = mk(`Chr '+')
- | '?': ret = mk(`Chr '?')
+ /* negated character classes */
+ | 'W': ret = `Some negranges(re, _ranges.tabasciiword[:])
+ | 'S': ret = `Some negranges(re, _ranges.tabasciispace[:])
+ | 'D': ret = `Some negranges(re, _ranges.tabasciidigit[:])
+ | 'X': ret = `Some negranges(re, _ranges.tabasciixdigit[:])
+ | 'H': ret = `Some negranges(re, _ranges.tabasciiblank[:])
- | 'W': ret = negranges(re, _ranges.tabasciiword[:])
- | 'S': ret = negranges(re, _ranges.tabasciispace[:])
- | 'D': ret = negranges(re, _ranges.tabasciidigit[:])
- | 'X': ret = negranges(re, _ranges.tabasciixdigit[:])
- | 'H': ret = negranges(re, _ranges.tabasciiblank[:])
- /*
- | 'p': unicodeclass(re)
- | 'P': negate(unicodeclass(re))
- */
+ /* unicode character classes */
+ | 'p': ret = unicodeclass(re, false)
+ | 'P': ret = unicodeclass(re, true)
+
+ /* escaped metachars */
+ | '^': ret = `Some mk(`Chr '^')
+ | '$': ret = `Some mk(`Chr '$')
+ | '.': ret = `Some mk(`Chr '.')
+ | '+': ret = `Some mk(`Chr '+')
+ | '?': ret = `Some mk(`Chr '?')
;;
-> ret
+}
+
+/*
+ Parses the name of a unicode character class and builds the matching
+ range node. Called from the escape handling for \p (neg = false) and
+ \P (neg = true).
+
+ The name is either a single character, as in \pL, or a brace-delimited
+ name, as in \p{Lu} or \p{Uppercase_Letter}.
+
+ Returns:
+	`Fail (`Earlystop)	if the pattern ends before a name, or before
+				the closing '}' of a braced name.
+	`Fail (`Badrange)	if the name is not one of the classes below.
+	`Some node		built by ranges() when neg is false, or by
+				negranges() when neg is true.
+ */
+const unicodeclass = {re, neg
+	var c, s
+	var tab
+	var n
+	var closed
+
+	if re.pat.len == 0
+		-> `Fail (`Earlystop)
+	;;
+	/*
+	 s starts as the unconsumed pattern and n counts the bytes of the
+	 class name, so that s[:n] ends up being exactly the name.
+	 */
+	n = 0
+	s = re.pat
+	/* either a single char pattern, or {pat} */
+	match getc(re)
+	| '{':
+		/* s was captured before the '{' was consumed; skip over it */
+		s = s[1:]
+		closed = false
+		while re.pat.len > 0
+			c = getc(re)
+			if c == '}'
+				closed = true
+				break
+			;;
+			n += std.charlen(c)
+		;;
+		/* ran out of pattern without seeing the closing '}' */
+		if !closed
+			-> `Fail (`Earlystop)
+		;;
+	| r:
+		n += std.charlen(r)
+	;;
+	s = s[:n]
+	/* letters */
+	if std.sleq(s, "L") || std.sleq(s, "Letter")
+		tab = _ranges.tabalpha[:]
+	elif std.sleq(s, "Lu") || std.sleq(s, "Uppercase_Letter")
+		tab = _ranges.tabupper[:]
+	elif std.sleq(s, "Ll") || std.sleq(s, "Lowercase_Letter")
+		tab = _ranges.tablower[:]
+	/*
+	 NOTE(review): Lt uses the same table as Ll. Confirm whether a
+	 dedicated titlecase table was intended here.
+	 */
+	elif std.sleq(s, "Lt") || std.sleq(s, "Titlecase_Letter")
+		tab = _ranges.tablower[:]
+	/* numbers (incomplete) */
+	elif std.sleq(s, "N") || std.sleq(s, "Number")
+		tab = _ranges.tabdigit[:]
+	/* separators */
+	elif std.sleq(s, "Z") || std.sleq(s, "Separator")
+		tab = _ranges.tabspace[:]
+	elif std.sleq(s, "Zs") || std.sleq(s, "Space_Separator")
+		tab = _ranges.tabblank[:]
+	else
+		-> `Fail (`Badrange)
+	;;
+	if !neg
+		-> `Some ranges(re, tab)
+	else
+		-> `Some negranges(re, tab)
+	;;
}
const chrclass = {re
--- a/types.myr
+++ b/types.myr
@@ -6,6 +6,7 @@
`Unbalanced
`Emptyparen
`Badrep
+ `Badrange
`Noimpl
;;