shithub: mc

Download patch

ref: 8bc429fb93070938b8bc5d459da4f2664c00abae
parent: 3a496c143eb8aad32c03c7067c54807256949c97
author: Ori Bernstein <[email protected]>
date: Mon Dec 28 20:02:54 EST 2015

Add initial tokenizer for myrddin parser.

--- /dev/null
+++ b/mparse/bld.proj
@@ -1,0 +1,7 @@
+bin tok =
+	main.myr
+	types.myr
+	tok.myr
+	tokdefs.myr
+	util.myr
+;;
--- /dev/null
+++ b/mparse/main.myr
@@ -1,0 +1,15 @@
+use std
+
+use "tok.use"
+
+/*
+tokenizer driver: reads all of file descriptor 0 into a token stream
+and prints one token per line until the stream yields `parse.Teof.
+*/
+const main = {
+	var stream
+
+	stream = parse.tokinitf(0)
+	while true
+		match parse.toknext(stream)
+		| `parse.Teof:
+			break
+		| t:
+			std.put("{}\n", t)
+		;;
+	;;
+}
--- /dev/null
+++ b/mparse/tok.myr
@@ -1,0 +1,540 @@
+use std
+
+use "types.use"
+use "tokdefs.use"
+use "util.use"
+
+pkg parse =
+	/* state of one in-progress tokenization over a single input buffer */
+	type tokstream = struct
+		/* token buffered by tokpeek; handed out and cleared by the next toknext */
+		next	: std.option(tok)
+		/* input that has not been consumed yet */
+		rest	: byte[:]
+		/* the whole input buffer; this is what tokclose frees */
+		data	: byte[:]
+		/* source position; line/col are adjusted as newlines are consumed */
+		loc	: srcloc
+	;;
+
+	/* create a stream from a file path / an open descriptor; release it with tokclose */
+	const tokinit	: (path : byte[:]	-> tokstream#)
+	const tokinitf	: (path : std.fd	-> tokstream#)
+	const tokclose	: (ts : tokstream#	-> void)
+
+	/* toknext consumes the next token; tokpeek returns it without consuming */
+	const toknext	: (ts : tokstream#	-> tok)
+	const tokpeek	: (ts : tokstream#	-> tok)
+;;
+
+/*
+value the lexers compare a taken character against to report an
+unexpected end of input; it is an alias for std.Badchar.
+NOTE(review): presumably std.strstep/std.decode yield std.Badchar on
+an empty slice -- confirm against libstd.
+*/
+const Eof = std.Badchar
+
+/*
+reads the whole file at `path` and returns a heap-allocated token
+stream over its contents. the program is terminated through
+std.fatal if the file cannot be read.
+
+the source location is initialised to line 1, column 1 of `path`;
+the rest of the tokenizer resets the column to 1 on every newline,
+so positions are 1-based. `path` is stored as-is (not copied), so it
+must stay valid for as long as the stream is in use. release the
+stream with tokclose.
+*/
+const tokinit = {path
+	var ts
+
+	match std.slurp(path)
+	| `std.Ok data:
+		ts = std.mk([.next=`std.None, .rest=data, .data=data])
+		ts.loc.file = path
+		ts.loc.line = 1
+		ts.loc.col = 1
+		-> ts
+	| `std.Fail e:	std.fatal("could not read file {}: {}\n", path, e)
+	;;
+}
+
+/*
+reads everything available on the descriptor `fd` and returns a
+heap-allocated token stream over it. the program is terminated
+through std.fatal if the read fails.
+
+the source location is initialised to line 1, column 1; since there
+is no path for a bare descriptor, the file name is the placeholder
+"<fd>". release the stream with tokclose.
+*/
+const tokinitf = {fd
+	var ts
+
+	match std.fslurp(fd)
+	| `std.Ok data:
+		ts = std.mk([.next=`std.None, .rest=data, .data=data])
+		ts.loc.file = "<fd>"
+		ts.loc.line = 1
+		ts.loc.col = 1
+		-> ts
+	| `std.Fail e:	std.fatal("could not read file {}: {}\n", fd, e)
+	;;
+}
+
+/*
+releases a token stream: frees the input buffer (ts.data) and then
+the stream structure itself. `ts` must not be used afterwards.
+*/
+const tokclose = {ts
+	std.slfree(ts.data)
+	std.free(ts)
+}
+
+/*
+consumes and returns the next token of the stream. if tokpeek has
+buffered a token in ts.next, that token is returned and the buffer
+is cleared; otherwise a fresh token is read from the input.
+
+this function does not print anything: callers that want a trace
+(such as the driver in main.myr) format the returned token
+themselves.
+*/
+const toknext = {ts
+	match ts.next
+	| `std.Some tok:
+		ts.next = `std.None
+		-> tok
+	| `std.None:
+		-> tokread(ts)
+	;;
+}
+
+/*
+returns the next token without consuming it. the token is read at
+most once: it is parked in ts.next so that a later tokpeek or
+toknext sees the same value.
+*/
+const tokpeek = {ts
+	var fresh
+
+	match ts.next
+	| `std.Some buffered:
+		-> buffered
+	| `std.None:
+		fresh = tokread(ts)
+		ts.next = `std.Some fresh
+		-> fresh
+	;;
+}
+
+/*
+reads one token directly from the input, ignoring anything buffered
+in ts.next. whitespace and comments are skipped first, then the
+first remaining character selects the sub-lexer:
+
+	end of input	-> `Teof
+	newline		-> `Tendln (and the line counter is advanced)
+	'		-> character literal
+	"		-> string literal
+	@		-> type parameter
+	digit		-> numeric literal
+	ident char	-> keyword or identifier
+	anything else	-> operator / punctuation
+*/
+const tokread : (ts : tokstream# -> tok) = {ts
+	var c
+
+	skipspace(ts)
+	c = peekc(ts)
+	if ts.rest.len == 0
+		-> `Teof
+	elif c == '\n'
+		takec(ts)
+		ts.loc.line++
+		ts.loc.col = 1
+		-> `Tendln
+	elif c == '\''
+		-> chrlit(ts)
+	elif c == '"'
+		-> strlit(ts)
+	elif c == '@'
+		-> typaram(ts)
+	elif std.isdigit(c)
+		/*
+		this test has to come before isident(): isident() also
+		accepts '0'..'9', so testing it first would route numbers
+		to kwident(), where identstr() refuses a leading digit and
+		returns "" without consuming any input.
+		*/
+		-> numlit(ts)
+	elif isident(c)
+		-> kwident(ts)
+	else
+		-> oper(ts)
+	;;
+}
+
+/*
+advances the stream past whitespace and comments.
+
+a newline is significant (it becomes a `Tendln token in tokread), so
+the scan stops in front of it -- unless it was preceded by a
+backslash, in which case that one newline is consumed as a line
+continuation and the line counter is advanced. '//' comments are
+skipped with skipto, '/*' comments with skipcomment. a '/' that does
+not start a comment ends the scan and is left for the operator
+lexer.
+*/
+const skipspace = {ts
+	var ignorenl
+
+	ignorenl = false
+	while true
+		match peekc(ts)
+		| '\n':
+			if ignorenl
+				takec(ts)
+				ts.loc.line++
+				ts.loc.col = 1
+				/* a backslash escapes only the single newline after it */
+				ignorenl = false
+			else
+				break
+			;;
+		| '\\':
+			ignorenl = true
+			takec(ts)
+		| '/':
+			/* peekc did not consume the '/', so offset 1 is the character after it */
+			match npeekc(ts, 1)
+			| '/':	skipto(ts, '\n')
+			| '*':	skipcomment(ts)
+			| _:	break
+			;;
+		| c:
+			if std.isspace(c)
+				takec(ts)
+			else
+				break
+			;;
+		;;
+	;;
+}
+
+/*
+consumes a block comment, including nested ones. it is entered with
+the opening slash-star still unconsumed: the first loop iteration
+takes that pair and raises depth to 1, and the loop ends once a
+matching star-slash brings depth back to 0. newlines inside the
+comment advance the line counter. running into Eof reports an error
+that names the line the comment started on.
+*/
+const skipcomment = {ts
+	var depth, startln
+
+	depth = 0
+	startln = ts.loc.line
+	while true
+		match takec(ts)
+		| '/':
+			/* slash-star opens one more nesting level */
+			if matchc(ts, '*')
+				depth++
+			;;
+		| '*':
+			/* star-slash closes the innermost level */
+			if matchc(ts, '/')
+				depth--
+			;;
+		| '\n':
+			ts.loc.line++
+			ts.loc.col = 1
+		| Eof:
+			err(ts.loc, "file ended in comment starting on line {}\n", startln)
+		| _:
+		;;
+
+		/* checked after every character, so the loop exits right after the closing pair */
+		if depth == 0
+			break
+		;;
+	;;
+}
+
+/*
+lexes a character literal. the stream is positioned on the opening
+quote; the result is a `Tchrlit carrying the (possibly unescaped)
+character. anything but a closing quote after the character is
+reported through err.
+*/
+const chrlit = {ts
+	var chr, term
+
+	/* drop the opening quote */
+	takec(ts)
+
+	chr = takec(ts)
+	if chr == '\\'
+		chr = unescape(ts)
+	;;
+
+	term = takec(ts)
+	if term != '\''
+		err(ts.loc, "expected closing ' in character literal, got {}\n", term)
+	;;
+	-> `Tchrlit chr
+}
+
+/*
+lexes a string literal. the stream is positioned on the opening
+double quote; characters are accumulated in a string buffer until
+the closing quote, with backslash sequences translated by unescape.
+end of input or a raw newline inside the literal is reported through
+err. returns a `Tstrlit holding the finished buffer contents.
+*/
+const strlit = {ts
+	var sb
+
+	/* drop the opening quote */
+	takec(ts)
+	sb = std.mksb()
+	while true
+		match takec(ts)
+		| Eof:
+			err(ts.loc, "unexpected EOF within string literal\n")
+		| '\n':
+			err(ts.loc, "unexpected \\n within string literal\n")
+		| '"':
+			break
+		| '\\':
+			std.sbputc(sb, unescape(ts))
+		| c:
+			std.sbputc(sb, c)
+		;;
+	;;
+	-> `Tstrlit std.sbfin(sb)
+}
+
+/*
+translates one escape sequence into the character it denotes. the
+caller has already consumed the backslash; this function consumes
+the rest of the sequence.
+
+supported forms are the single-letter escapes listed below, \u
+(handled by utfesc) and \xHH with exactly two hex digits. anything
+else is reported through err.
+*/
+const unescape = {ts
+	var c, c1, c2
+
+	c = takec(ts)
+	/* we've already seen the '\' */
+	match c
+	| 'n':	-> '\n'
+	| 'r':	-> '\r'
+	| 't':	-> '\t'
+	| 'b':	-> '\b'
+	| '"':	-> '\"'
+	| '\'':	-> '\''
+	| 'v':	-> '\v'
+	| '\\':	-> '\\'
+	| '0':	-> '\0'
+	| 'u':	-> utfesc(ts)
+	| 'x':
+		/* both digits are validated before they are combined */
+		c1 = takec(ts)
+		if !std.isxdigit(c1)
+			err(ts.loc, "expected hex digit, got {}\n", c1)
+		;;
+		c2 = takec(ts)
+		if !std.isxdigit(c2)
+			err(ts.loc, "expected hex digit, got {}\n", c2)
+		;;
+		-> 16*std.charval(c1, 16) + std.charval(c2, 16)
+	| esc:
+		err(ts.loc, "unknown escape code \\{}\n", esc)
+	;;
+}
+
+/*
+lexes the body of a \u escape. the caller has consumed the
+backslash and the 'u'; what follows must be '{', a run of hex
+digits, and '}'. the digits are accumulated into a code point,
+which is rejected as soon as it exceeds 0x10FFFF. returns the
+accumulated value.
+*/
+const utfesc = {ts
+	var c, v
+
+	if takec(ts) != '{'
+		err(ts.loc, "\\u escape sequence without initial '{'\n")
+	;;
+	v = 0
+	c = std.Badchar
+	while true
+		c = takec(ts)
+		if std.isxdigit(c)
+			v *= 16
+			v += std.charval(c, 16)
+		else
+			break
+		;;
+		if v > 0x10FFFF
+			err(ts.loc, "invalid codepoint in \\u escape sequence\n")
+		;;
+	;;
+	/* c is the first non-hex character; it has to be the closing brace */
+	if c != '}'
+		err(ts.loc, "\\u escape sequence without closing '}'\n")
+	;;
+	-> v
+}
+
+/*
+lexes a type parameter. the stream is positioned on the '@', which
+is dropped; the word after it is lexed with kwident. an identifier
+becomes a `Ttyparam carrying the name, while any other token (a
+keyword) is reported through err.
+*/
+const typaram = {ts
+	takec(ts)
+	match kwident(ts)
+	| `Tident id:
+		-> `Ttyparam id
+	| kw:
+		err(ts.loc, "'{}' used as type parameter\n", kw)
+	;;
+
+}
+
+/*
+lexes a numeric literal. a leading "0x", "0b" or "0o" selects base
+16, 2 or 8; everything else is read as base 10. the prefix
+(including a bare leading '0') is consumed here, and the remaining
+digits are handed to number().
+*/
+const numlit = {ts
+	var t
+
+	if matchc(ts, '0')
+		if matchc(ts, 'x')
+			t = number(ts, 16)
+		elif matchc(ts, 'b')
+			t = number(ts, 2)
+		elif matchc(ts, 'o')
+			t = number(ts, 8)
+		else
+			/* the '0' is already consumed; number() sees whatever follows it */
+			t = number(ts, 10)
+		;;
+	else
+		t = number(ts, 10)
+	;;
+	-> t
+}
+
+/*
+only deals with the body of the number. if we reach
+this code, then it's guaranteed that we already have
+a numerical value.
+
+called by numlit with the stream positioned after any base prefix
+and with `base` set to 2, 8, 10 or 16; numlit returns this
+function's result as the token.
+
+NOTE(review): the body is empty, so no digits are consumed and no
+token is produced yet -- this is still a stub.
+*/
+const number = {ts, base
+
+}
+
+/*
+lexes a word with identstr and classifies it: reserved words map to
+their keyword, attribute or literal tokens, and everything else is
+returned as a `Tident carrying the word.
+
+NOTE(review): identstr returns a std.sldup copy of the word; on the
+keyword arms that copy is not freed here -- confirm whether that is
+intended.
+*/
+const kwident = {ts
+	match identstr(ts)
+	| "$": 	-> `Tidxlen
+	| "_": 	-> `Tgap
+	| "$noret": 	-> `Tattr `Attrnoret
+	| "break": 	-> `Tbreak
+	| "castto": 	-> `Tcast
+	| "const": 	-> `Tconst
+	| "continue": 	-> `Tcontinue
+	| "elif": 	-> `Telif
+	| "else": 	-> `Telse
+	| "extern": 	-> `Tattr `Attrextern
+	| "false": 	-> `Tboollit false
+	| "for": 	-> `Tfor
+	| "generic": 	-> `Tgeneric
+	| "goto": 	-> `Tgoto
+	| "if": 	-> `Tif
+	| "impl": 	-> `Timpl
+	| "in": 	-> `Tin
+	| "match": 	-> `Tmatch
+	| "pkg": 	-> `Tpkg
+	| "pkglocal": 	-> `Tattr `Attrpkglocal
+	| "sizeof": 	-> `Tsizeof
+	| "struct": 	-> `Tstruct
+	| "trait": 	-> `Ttrait
+	| "true": 	-> `Tboollit true
+	| "type": 	-> `Ttype
+	| "union": 	-> `Tunion
+	| "use": 	-> `Tuse
+	| "var": 	-> `Tvar
+	| "void": 	-> `Tvoidlit
+	| "while": 	-> `Twhile
+	| ident:	-> `Tident ident
+	;;
+}
+
+/*
+lexes an operator or punctuation token. the first character is
+consumed unconditionally; multi-character operators are recognised
+by greedily matching the characters that follow it (so "<<=" wins
+over "<<", which wins over "<"). a character that starts no known
+token is reported through err and yields `Terror.
+*/
+const oper = {ts
+	var t, chr
+
+	chr = takec(ts)
+	/* placeholder so that t is assigned on every path */
+	t = `Tobrace
+	match chr
+	| '{': t = `Tobrace
+	| '}': t = `Tcbrace
+	| '(': t = `Toparen
+	| ')': t = `Tcparen
+	| '[': t = `Tosqbrac
+	| ']': t = `Tcsqbrac
+	| ',': t = `Tcomma
+	| '`': t = `Ttick
+	| '#': t = `Tderef
+	| '~': t = `Tbnot
+	| ':':
+		if matchc(ts, ':')
+			t = `Twith
+		else
+			t = `Tcolon
+		;;
+	| ';':
+		if matchc(ts, ';')
+			t = `Tendblk
+		else
+			t = `Tendln
+		;;
+	| '.':
+		/*
+		the first '.' is already consumed, so the two dots that
+		complete an ellipsis sit at offsets 0 and 1 of the
+		remaining input.
+		*/
+		if npeekc(ts, 0) == '.' && npeekc(ts, 1) == '.'
+			takec(ts)
+			takec(ts)
+			t = `Tellipsis
+		else
+			t = `Tdot
+		;;
+	| '+':
+		if matchc(ts, '=')
+			t = `Taddeq
+		elif matchc(ts, '+')
+			t = `Tinc
+		else
+			t = `Tplus
+		;;
+	| '-':
+		if matchc(ts, '=')
+			t = `Tsubeq
+		elif matchc(ts, '-')
+			t = `Tdec
+		elif matchc(ts, '>')
+			t = `Tret
+		else
+			t = `Tminus
+		;;
+	| '*':
+		if matchc(ts, '=')
+			t = `Tmuleq
+		else
+			t = `Tmul
+		;;
+	| '/':
+		if matchc(ts, '=')
+			t = `Tdiveq
+		else
+			t = `Tdiv
+		;;
+	| '%':
+		if matchc(ts, '=')
+			t = `Tmodeq
+		else
+			t = `Tmod
+		;;
+	| '=':
+		if matchc(ts, '=')
+			t = `Teq
+		else
+			t = `Tasn
+		;;
+	| '|':
+		if matchc(ts, '=')
+			t = `Tboreq
+		elif matchc(ts, '|')
+			t = `Tlor
+		else
+			t = `Tbor
+		;;
+	| '&':
+		if matchc(ts, '=')
+			t = `Tbandeq
+		elif matchc(ts, '&')
+			t = `Tland
+		else
+			t = `Tband
+		;;
+	| '^':
+		if matchc(ts, '=')
+			t = `Tbxoreq
+		else
+			t = `Tbxor
+		;;
+	| '<':
+		if matchc(ts, '=')
+			t = `Tle
+		elif matchc(ts, '<')
+			if matchc(ts, '=')
+				t = `Tbsleq
+			else
+				t = `Tbsl
+			;;
+		else
+			t = `Tlt
+		;;
+	| '>':
+		if matchc(ts, '=')
+			t = `Tge
+		elif matchc(ts, '>')
+			if matchc(ts, '=')
+				t = `Tbsreq
+			else
+				t = `Tbsr
+			;;
+		else
+			t = `Tgt
+		;;
+
+	| '!':
+		if matchc(ts, '=')
+			t = `Tne
+		else
+			t = `Tlnot
+		;;
+	| c:
+		t = `Terror
+		err(ts.loc, "junk character {}", c)
+	;;
+	-> t
+}
+
+/*
+consumes the longest run of identifier characters at the front of
+the input and returns a std.sldup copy of it. if the input is empty
+or starts with a digit, "" is returned and nothing is consumed.
+
+the scan works on individual bytes cast to char rather than on
+decoded characters, and it does not touch ts.loc.
+*/
+const identstr = {ts
+	var i, str
+
+	/* ASCII */
+	if ts.rest.len == 0 || std.isdigit(ts.rest[0] castto(char))
+		-> ""
+	;;
+
+	/* after the loop, i is the index of the first non-identifier byte (or the length) */
+	for i = 0; i < ts.rest.len; i++
+		if !isident(ts.rest[i] castto(char))
+			break
+		;;
+	;;
+	str = ts.rest[:i]
+	ts.rest = ts.rest[i:]
+	-> std.sldup(str)
+}
+
+/*
+identifier-character test: true for 'a'..'z', 'A'..'Z', '0'..'9',
+'_' and '$'. the leading `c & 0x80 == 0` term additionally requires
+the high bit to be clear. note that digits are accepted here; it is
+identstr that refuses a word starting with one.
+*/
+const isident = {c
+	-> c & 0x80 == 0 && \
+		(c >= 'a' && c <= 'z' || \
+		 c >= 'A' && c <= 'Z' || \
+		 c >= '0' && c <= '9' || \
+		 c == '_' || c == '$')
+}
+
+/*
+returns the character at the front of the unconsumed input, as
+produced by std.decode, without modifying the stream.
+*/
+const peekc = {ts
+	-> std.decode(ts.rest)
+}
+
+/*
+looks ahead without consuming: steps over `n` characters of a local
+copy of the remaining input and returns the character decoded at
+that position. n == 0 is the same as peekc. the stream itself is
+left untouched.
+*/
+const npeekc = {ts, n
+	var skipped, tail
+
+	tail = ts.rest
+	for var k = 0; k < n; k++
+		(skipped, tail) = std.strstep(tail)
+	;;
+	-> std.decode(tail)
+}
+
+/*
+consumes one character: std.strstep splits the remaining input into
+its first character and the tail, the tail becomes the new
+remaining input, and the character is returned.
+*/
+const takec = {ts
+	var head, tail
+
+	(head, tail) = std.strstep(ts.rest)
+	ts.rest = tail
+	-> head
+}
+
+/*
+advances the stream until the next character is `chr`, or until the
+input is exhausted if `chr` never occurs. `chr` itself is left
+unconsumed: skipspace uses this to skip a '//' comment, and the
+terminating newline must stay in the input so that it is still
+counted and turned into a token by the caller.
+*/
+const skipto = {ts, chr
+	while ts.rest.len > 0 && peekc(ts) != chr
+		takec(ts)
+	;;
+}
+
+/*
+conditional consume: if the next character of the input equals
+`chr` it is consumed and true is returned; otherwise the stream is
+left as it was and false is returned.
+*/
+const matchc = {ts, chr
+	var head, tail
+
+	(head, tail) = std.strstep(ts.rest)
+	if head != chr
+		-> false
+	;;
+	ts.rest = tail
+	-> true
+}
--- /dev/null
+++ b/mparse/tokdefs.myr
@@ -1,0 +1,210 @@
+use std
+
+use "types.use"
+
+pkg parse =
+	/*
+	every kind of token the tokenizer produces. tags without an
+	argument stand for a fixed spelling (shown in the comment next
+	to them); literal, identifier, type parameter and attribute
+	tokens carry their value.
+	*/
+	type tok = union
+		`Terror
+		`Teof
+		`Tplus    /* + */
+		`Tminus   /* - */
+		`Tmul     /* * */
+		`Tdiv     /* / */
+		`Tinc     /* ++ */
+		`Tdec     /* -- */
+		`Tmod     /* % */
+		`Tasn     /* = */
+		`Taddeq   /* += */
+		`Tsubeq   /* -= */
+		`Tmuleq   /* *= */
+		`Tdiveq   /* /= */
+		`Tmodeq   /* %= */
+		`Tboreq   /* |= */
+		`Tbxoreq  /* ^= */
+		`Tbandeq  /* &= */
+		`Tbsleq   /* <<= */
+		`Tbsreq   /* >>= */
+		
+		`Tbor     /* | */
+		`Tbxor    /* ^ */
+		`Tband    /* & */
+		`Tbsl     /* << */
+		`Tbsr     /* >> */
+		`Tbnot    /* ~ */
+	
+		`Teq      /* == */
+		`Tgt      /* > */
+		`Tlt      /* < */
+		`Tge      /* >= */
+		`Tle      /* <= */
+		`Tne      /* != */
+	
+		`Tlor     /* || */
+		`Tland    /* && */
+		`Tlnot    /* ! */
+	
+		`Tobrace  /* { */
+		`Tcbrace  /* } */
+		`Toparen  /* ( */
+		`Tcparen  /* ) */
+		`Tosqbrac /* [ */
+		`Tcsqbrac /* ] */
+		`Tat      /* @ */
+		`Ttick    /* ` */
+		`Tderef   /* # */
+		`Tidxlen  /* $ */
+	
+		`Ttype    /* type */
+		`Tfor     /* for */
+		`Tin      /* in */
+		`Twhile   /* while */
+		`Tif      /* if */
+		`Telse    /* else */
+		`Telif    /* elif */
+		`Tmatch   /* match */
+		`Tgoto    /* goto */
+		`Tbreak   /* break */
+		`Tcontinue   /* continue */
+	
+		`Tintlit int64
+		`Tstrlit byte[:]
+		`Tfltlit flt64
+		`Tchrlit char
+		`Tboollit bool
+		`Tvoidlit
+	
+		`Ttrait   /* trait */
+		`Timpl   /* impl */
+		`Tstruct  /* struct */
+		`Tunion   /* union */
+		`Ttyparam byte[:] /* @typename */
+	
+		`Tconst   /* const */
+		`Tvar     /* var */
+		`Tgeneric /* generic */
+		`Tcast    /* castto */
+	
+		`Tgap     /* _ */
+		`Tellipsis/* ... */
+		`Tendln   /* ; or \n */
+		`Tendblk  /* ;; */
+		`Tcolon   /* : */
+		`Twith    /* :: */
+		`Tdot     /* . */
+		`Tcomma   /* , */
+		`Tret     /* -> */
+		`Tuse     /* use */
+		`Tpkg     /* pkg */
+		`Tsizeof  /* sizeof */
+		`Tattr attr   /* $attr */
+		`Tident byte[:]
+	;;
+;;
+
+/*
+registers tokfmt with std.fmtinstall as the formatter for values of
+type tok. `dummy` exists only so that std.typeof has a value of the
+right type to inspect.
+*/
+const __init__ = {
+	var dummy : tok
+
+	dummy = `Terror
+	std.fmtinstall(std.typeof(dummy), tokfmt, [][:])
+}
+
+/*
+formatter installed for the tok type. it pulls the token out of the
+argument list `ap` with std.vanext and appends its source spelling
+to the string buffer `sb`: fixed tokens print as the text they were
+lexed from, value-carrying tokens print their payload. `opts` is
+not used.
+*/
+const tokfmt = {sb, ap, opts
+	var tok
+
+	tok = std.vanext(ap)
+	match tok
+	| `Terror:	std.sbfmt(sb, "ERROR")
+	| `Teof:	std.sbfmt(sb, "EOF")
+	| `Tplus:	std.sbfmt(sb, "+")
+	| `Tminus:	std.sbfmt(sb, "-")
+	| `Tmul:	std.sbfmt(sb, "*")
+	| `Tdiv:	std.sbfmt(sb, "/")
+	| `Tinc:	std.sbfmt(sb, "++")
+	| `Tdec:	std.sbfmt(sb, "--")
+	| `Tmod:	std.sbfmt(sb, "%")
+	| `Tasn:	std.sbfmt(sb, "=")
+	| `Taddeq:	std.sbfmt(sb, "+=")
+	| `Tsubeq:	std.sbfmt(sb, "-=")
+	| `Tmuleq:	std.sbfmt(sb, "*=")
+	| `Tdiveq:	std.sbfmt(sb, "/=")
+	| `Tmodeq:	std.sbfmt(sb, "%=")
+	| `Tboreq:	std.sbfmt(sb, "|=")
+	| `Tbxoreq:	std.sbfmt(sb, "^=")
+	| `Tbandeq:	std.sbfmt(sb, "&=")
+	| `Tbsleq:	std.sbfmt(sb, "<<=")
+	| `Tbsreq:	std.sbfmt(sb, ">>=")
+	| `Tbor:	std.sbfmt(sb, "|")
+	| `Tbxor:	std.sbfmt(sb, "^")
+	| `Tband:	std.sbfmt(sb, "&")
+	| `Tbsl:	std.sbfmt(sb, "<<")
+	| `Tbsr:	std.sbfmt(sb, ">>")
+	| `Tbnot:	std.sbfmt(sb, "~")
+
+	| `Teq:		std.sbfmt(sb, "==")
+	| `Tgt:		std.sbfmt(sb, ">")
+	| `Tlt:		std.sbfmt(sb, "<")
+	| `Tge:		std.sbfmt(sb, ">=")
+	| `Tle:		std.sbfmt(sb, "<=")
+	| `Tne:		std.sbfmt(sb, "!=")
+
+	| `Tlor:	std.sbfmt(sb, "||")
+	| `Tland:	std.sbfmt(sb, "&&")
+	| `Tlnot:	std.sbfmt(sb, "!")
+
+	/* braces are doubled because a single brace starts a format specifier */
+	| `Tobrace:	std.sbfmt(sb, "{{")
+	| `Tcbrace:	std.sbfmt(sb, "}}")
+	| `Toparen:	std.sbfmt(sb, "(")
+	| `Tcparen:	std.sbfmt(sb, ")")
+	| `Tosqbrac:	std.sbfmt(sb, "[")
+	| `Tcsqbrac:	std.sbfmt(sb, "]")
+	| `Tat:		std.sbfmt(sb, "@")
+	| `Ttick:	std.sbfmt(sb, "`")
+	| `Tderef:	std.sbfmt(sb, "#")
+	| `Tidxlen:	std.sbfmt(sb, "$")
+
+	| `Ttype:	std.sbfmt(sb, "type")
+	| `Tfor:	std.sbfmt(sb, "for")
+	| `Tin:		std.sbfmt(sb, "in")
+	| `Twhile:	std.sbfmt(sb, "while")
+	| `Tif:		std.sbfmt(sb, "if")
+	| `Telse:	std.sbfmt(sb, "else")
+	| `Telif:	std.sbfmt(sb, "elif")
+	| `Tmatch:	std.sbfmt(sb, "match")
+	| `Tgoto:	std.sbfmt(sb, "goto")
+	| `Tbreak:	std.sbfmt(sb, "break")
+	| `Tcontinue:	std.sbfmt(sb, "continue")
+
+	| `Tintlit v:	std.sbfmt(sb, "{}", v)
+	| `Tstrlit v:	std.sbfmt(sb, "{e}", v)
+	| `Tfltlit v:	std.sbfmt(sb, "{}", v)
+	| `Tchrlit v:	std.sbfmt(sb, "{}", v)
+	| `Tboollit v:	std.sbfmt(sb, "{}", v)
+	| `Tvoidlit:	std.sbfmt(sb, "void")
+
+	| `Ttrait:	std.sbfmt(sb, "trait")
+	| `Timpl:	std.sbfmt(sb, "impl")
+	| `Tstruct:	std.sbfmt(sb, "struct")
+	| `Tunion:	std.sbfmt(sb, "union")
+	| `Ttyparam tp:	std.sbfmt(sb, "@{}", tp)
+
+	| `Tconst:	std.sbfmt(sb, "const")
+	| `Tvar:	std.sbfmt(sb, "var")
+	| `Tgeneric:	std.sbfmt(sb, "generic")
+	| `Tcast:	std.sbfmt(sb, "castto")
+	| `Tgap:	std.sbfmt(sb, "_")
+
+	| `Tellipsis:	std.sbfmt(sb, "...")
+	| `Tendln:	std.sbfmt(sb, ";")
+	| `Tendblk:	std.sbfmt(sb, ";;")
+	| `Tcolon:	std.sbfmt(sb, ":")
+	| `Twith:	std.sbfmt(sb, "::")
+	| `Tdot:	std.sbfmt(sb, ".")
+	| `Tcomma:	std.sbfmt(sb, ",")
+	| `Tret:	std.sbfmt(sb, "->")
+	| `Tuse:	std.sbfmt(sb, "use")
+	| `Tpkg:	std.sbfmt(sb, "pkg")
+	| `Tattr a:	std.sbfmt(sb, "{}", a)
+	| `Tsizeof:	std.sbfmt(sb, "sizeof")
+	| `Tident str:	std.sbfmt(sb, "{}", str)
+	;;
+}
--- /dev/null
+++ b/mparse/types.myr
@@ -1,0 +1,13 @@
+pkg parse =
+	/* a position in a source file, attached to token streams and error reports */
+	type srcloc = struct
+		file	: byte[:]
+		line	: int
+		col	: int
+	;;
+
+	/* declaration attributes; the tokenizer produces them for "pkglocal", "extern" and "$noret" */
+	type attr = union
+		`Attrpkglocal
+		`Attrextern
+		`Attrnoret
+	;;
+;;
--- /dev/null
+++ b/mparse/util.myr
@@ -1,0 +1,14 @@
+use std
+
+use "types.use"
+
+pkg parse =
+	/*
+	error reporting entry points. both take the source location
+	the problem was found at and a format string; err takes the
+	format arguments variadically, verr as a std.valist. both are
+	declared $noret.
+	*/
+	$noret const err	: (loc : srcloc, msg : byte[:], args : ... -> void)
+	$noret const verr	: (loc : srcloc, msg : byte[:], args : std.valist -> void)
+;;
+
+/*
+reports an error at `loc`, with `msg` as the format string and
+`args` as its arguments.
+
+NOTE(review): the body is empty, so nothing is printed and control
+returns to the caller even though the declaration is $noret. the
+tokenizer's error paths rely on this call not returning --
+presumably it is meant to format the message and abort; confirm and
+implement.
+*/
+const err = {loc, msg, args
+}
+
+/*
+variant of err that takes its format arguments as a std.valist.
+
+NOTE(review): the body is empty, so nothing is printed and control
+returns to the caller even though the declaration is $noret --
+still a stub.
+*/
+const verr = {loc, msg, ap
+}