shithub: purgatorio

ref: 38c0cf0737de03906729054ad7b7b011dd1ca475
dir: /appl/lib/xml.b/

View raw version
implement Xml;

#
# Portions copyright © 2002 Vita Nuova Holdings Limited
#
#
# Derived from saxparser.b Copyright © 2001-2002 by John Powers or his employer
#

# TO DO:
# - provide a way of getting attributes out of <?...?> (process) requests,
# so that we can process stylesheet requests given in that way.

include "sys.m";
	sys: Sys;
include "bufio.m";
	bufio: Bufio;
	Iobuf: import bufio;
include "string.m";
	str: String;
include "hash.m";
	hash: Hash;
	HashTable: import hash;
include "xml.m";

Parcel: adt {
	pick {
	Start or
	Empty =>
		name: string;
		attrs: Attributes;
	End =>
		name: string;
	Text =>
		ch: string;
		ws1, ws2: int;
	Process =>
		target: string;
		data: string;
	Error =>
		loc:	Locator;
		msg:	string;
	Doctype =>
		name:	string;
		public:	int;
		params:	list of string;
	Stylesheet =>
		attrs: Attributes;
	EOF =>
	}
};

entinit := array[] of {
	("AElig", "Æ"),
	("OElig", "Œ"),
	("aelig", "æ"),
	("amp", "&"),
	("apos", "\'"),
	("copy", "©"),
	("gt", ">"),
	("ldquo", "``"),
	("lt", "<"),
	("mdash", "-"),		# XXX ??
	("oelig", "œ"),
	("quot", "\""),
	("rdquo", "''"),
	("rsquo", "'"),
	("trade", "™"),
	("nbsp", "\u00a0"),
};
entdict: ref HashTable;

init(): string
{
	sys = load Sys Sys->PATH;
	bufio = load Bufio Bufio->PATH;
	if (bufio == nil)
		return sys->sprint("cannot load %s: %r", Bufio->PATH);
	str = load String String->PATH;
	if (str == nil)
		return sys->sprint("cannot load %s: %r", String->PATH);
	hash = load Hash Hash->PATH;
	if (hash == nil)
		return sys->sprint("cannot load %s: %r", Hash->PATH);
	entdict = hash->new(23);
	for (i := 0; i < len entinit; i += 1) {
		(key, value) := entinit[i];
		entdict.insert(key, (0, 0.0, value));
	}
	return nil;
}

blankparser: Parser;

open(srcfile: string, warning: chan of (Locator, string), preelem: string): (ref Parser, string)
{
	fd := bufio->open(srcfile, Bufio->OREAD);
	if(fd == nil)
		return (nil, sys->sprint("cannot open %s: %r", srcfile));
	return fopen(fd, srcfile, warning, preelem);
}

fopen(fd: ref Bufio->Iobuf, name: string, warning: chan of (Locator, string), preelem: string): (ref Parser, string)
{
	x := ref blankparser;
	x.in = fd;
	# ignore utf16 initialisation character (yuck)
	c := x.in.getc();
	if (c != 16rfffe && c != 16rfeff)
		x.in.ungetc();
	x.estack = nil;
	x.loc = Locator(1, name, "");
	x.warning = warning;
	x.preelem = preelem;
	return (x, "");
}

Parser.next(x: self ref Parser): ref Item
{
	curroffset := x.fileoffset;
	currloc := x.loc;
	# read up until end of current item
	while (x.actdepth > x.readdepth) {
		pick p := getparcel(x) {
		Start =>
			x.actdepth++;
		End =>
			x.actdepth--;
		EOF =>
			x.actdepth = 0;			# premature EOF closes all tags
		Error =>
			return ref Item.Error(curroffset, x.loc, x.errormsg);
		}
	}
	if (x.actdepth < x.readdepth) {
		x.fileoffset = int x.in.offset();
		return nil;
	}
	gp := getparcel(x);
	item: ref Item;
	pick p := gp {
	Start =>
		x.actdepth++;
		item = ref Item.Tag(curroffset, p.name, p.attrs);
	End =>
		x.actdepth--;
		item = nil;
	EOF =>
		x.actdepth = 0;
		item = nil;
	Error =>
		x.actdepth = 0;			# XXX is this the right thing to do?
		item = ref Item.Error(curroffset, currloc, x.errormsg);
	Text =>
		item = ref Item.Text(curroffset, p.ch, p.ws1, p.ws2);
	Process =>
		item = ref Item.Process(curroffset, p.target, p.data);
	Empty =>
		item = ref Item.Tag(curroffset, p.name, p.attrs);
	Doctype =>
		item = ref Item.Doctype(curroffset, p.name, p.public, p.params);
	Stylesheet =>
		item = ref Item.Stylesheet(curroffset, p.attrs);
	}
	x.fileoffset = int x.in.offset();
	return item;
}

Parser.atmark(x: self ref Parser, m: ref Mark): int
{
	return  int x.in.offset() == m.offset;
}

Parser.down(x: self ref Parser)
{
	x.readdepth++;
}

Parser.up(x: self ref Parser)
{
	x.readdepth--;
}

# mark is only defined after a next(), not after up() or down().
# this means that we don't have to record lots of state when going up or down levels.
Parser.mark(x: self ref Parser): ref Mark
{
	return ref Mark(x.estack, x.loc.line, int x.in.offset(), x.readdepth);
}

Parser.goto(x: self ref Parser, m: ref Mark)
{
	x.in.seek(big m.offset, Sys->SEEKSTART);
	x.fileoffset = m.offset;
	x.eof = 0;
	x.estack = m.estack;
	x.loc.line = m.line;
	x.readdepth = m.readdepth;
	x.actdepth = len x.estack;
}

Mark.str(m: self ref Mark): string
{
	# assume that neither the filename nor any of the tags contain spaces.
	# format:
	# offset readdepth linenum [tag...]
	# XXX would be nice if the produced string did not contain
	# any spaces so it could be treated as a word in other contexts.
	s := sys->sprint("%d %d %d", m.offset, m.readdepth, m.line);
	for (t := m.estack; t != nil; t = tl t)
		s += " " + hd t;
	return s;
}

Parser.str2mark(p: self ref Parser, s: string): ref Mark
{
	(n, toks) := sys->tokenize(s, " ");
	if (n < 3)
		return nil;
	m := ref Mark(nil, p.loc.line, 0, 0);
	(m.offset, toks) = (int hd toks, tl toks);
	(m.readdepth, toks) = (int hd toks, tl toks);
	(m.line, toks) = (int hd toks, tl toks);
	m.estack = toks;
	return m;
}

getparcel(x: ref Parser): ref Parcel
{
	{
		p: ref Parcel;
		while (!x.eof && p == nil) {
			c := getc(x);
			if (c == '<')
				p = element(x);
			else {
				ungetc(x);
				p = characters(x);
			}
		}
		if (p == nil)
			p = ref Parcel.EOF;
		return p;
	}exception e{
	"sax:*" =>
			return ref Parcel.Error(x.loc, x.errormsg);
	}
}

parcelstr(gi: ref Parcel): string
{
	if (gi == nil)
		return "nil";
	pick i := gi {
	Start =>
		return sys->sprint("Start: %s", i.name);
	Empty =>
		return sys->sprint("Empty: %s", i.name);
	End =>
		return "End";
	Text =>
		return "Text";
	Doctype =>
		return sys->sprint("Doctype: %s", i.name);
	Stylesheet =>
		return "Stylesheet";
	Error =>
		return "Error: " + i.msg;
	EOF =>
		return "EOF";
	* =>
		return "Unknown";
	}
}

element(x: ref Parser): ref Parcel
{
	# <tag ...>
	elemname := xmlname(x);
	c: int;
	if (elemname != "") {
		attrs := buildattrs(x);
		skipwhite(x);
		c = getc(x);
		isend := 0;
		if (c == '/')
			isend = 1;
		else
			ungetc(x);
		expect(x, '>');

		if (isend)
			return ref Parcel.Empty(elemname, attrs);
		else {
			startelement(x, elemname);
			return ref Parcel.Start(elemname, attrs);
		}
	# </tag>
	} else if ((c = getc(x)) == '/') {
		elemname = xmlname(x);
		if (elemname != "") {
			expect(x, '>');
			endelement(x, elemname);
			return ref Parcel.End(elemname);
		}
		else
			error(x, sys->sprint("illegal beginning of tag: '%c'", c));
	# <?tag ... ?>
	} else if (c == '?') {
		elemname = xmlname(x);
		if (elemname != "") {
			# this special case could be generalised if there were many
			# processing instructions that took attributes like this.
			if (elemname == "xml-stylesheet") {
				attrs := buildattrs(x);
				balancedstring(x, "?>");
				return ref Parcel.Stylesheet(attrs);
			} else {
				data := balancedstring(x, "?>");
				return ref Parcel.Process(elemname, data);
			}
		}
	} else if (c == '!') {
		c = getc(x);
		case c {
		'-' =>
			# <!-- comment -->
			if(getc(x) == '-'){
				balancedstring(x, "-->");
				return nil;
			}
		'[' =>
			# <![CDATA[...]]
			s := xmlname(x);
			if(s == "CDATA" && getc(x) == '['){
				data := balancedstring(x, "]]>");
				return ref Parcel.Text(data, 0, 0);
			}
		* =>
			# <!declaration
			ungetc(x);
			s := xmlname(x);
			case s {
			"DOCTYPE" =>
				# <!DOCTYPE name (SYSTEM "filename" | PUBLIC "pubid" "uri"?)? ("[" decls "]")?>
				skipwhite(x);
				name := xmlname(x);
				if(name == nil)
					break;
				id := "";
				uri := "";
				public := 0;
				skipwhite(x);
				case sort := xmlname(x) {
				"SYSTEM" =>
					id = xmlstring(x, 1);
				"PUBLIC" =>
					public = 1;
					id = xmlstring(x, 1);
					skipwhite(x);
					c = getc(x);
					ungetc(x);
					if(c == '"' || c == '\'')
						uri = xmlstring(x, 1);
				* =>
					error(x, sys->sprint("unknown DOCTYPE: %s", sort));
					return nil;
				}
				skipwhite(x);
				if(getc(x) == '['){
					error(x, "cannot handle DOCTYPE with declarations");
					return nil;
				}
				ungetc(x);
				skipwhite(x);
				if(getc(x) == '>')
					return ref Parcel.Doctype(name, public, id :: uri :: nil);
			"ELEMENT" or "ATTRLIST" or "NOTATION" or "ENTITY" =>
				# don't interpret internal DTDs
				# <!ENTITY name ("value" | SYSTEM "filename")>
				s = gets(x, '>');
				if(s == nil || s[len s-1] != '>')
					error(x, "end of file in declaration");
				return nil;
			* =>
				error(x, sys->sprint("unknown declaration: %s", s));
			}
		}
		error(x, "invalid XML declaration");
	} else
		error(x, sys->sprint("illegal beginning of tag: %c", c));
	return nil;
}

characters(x: ref Parser): ref Parcel
{
	p: ref Parcel;
	content := gets(x, '<');
	if (len content > 0) {
		if (content[len content - 1] == '<') {
			ungetc(x);
			content = content[0:len content - 1];
		}
		ws1, ws2: int;
		if (x.ispre) {
			content = substituteentities(x, content);
			ws1 = ws2 = 0;
		} else
			(content, ws1, ws2) = substituteentities_sp(x, content);
		if (content != nil || ws1)
			p = ref Parcel.Text(content, ws1, ws2);
	}
	return p;
}

startelement(x: ref Parser, name: string)
{
	x.estack = name :: x.estack;
	if (name == x.preelem)
		x.ispre++;
}

endelement(x: ref Parser, name: string)
{
	if (x.estack != nil && name == hd x.estack) {
		x.estack = tl x.estack;
		if (name == x.preelem)
			x.ispre--;
	} else {
		starttag := "";
		if (x.estack != nil)
			starttag = hd x.estack;
		warning(x, sys->sprint("<%s></%s> mismatch", starttag, name));

		# invalid XML but try to recover anyway to reduce turnaround time on fixing errors.
		# loop back up through the tag stack to see if there's a matching tag, in which case
		# jump up in the stack to that, making some rude assumptions about the
		# way Parcels are handled at the top level.
		n := 0;
		for (t := x.estack; t != nil; (t, n) = (tl t, n + 1))
			if (hd t == name)
				break;
		if (t != nil) {
			x.estack = tl t;
			x.actdepth -= n;
		}
	}
}

buildattrs(x: ref Parser): Attributes
{
	attrs: list of Attribute;

	attr: Attribute;
	for (;;) {
		skipwhite(x);
		attr.name = xmlname(x);
		if (attr.name == nil)
			break;
		skipwhite(x);
		c := getc(x);
		if(c != '='){
			ungetc(x);
			attr.value = nil;
		}else
			attr.value = xmlstring(x, 1);
		attrs = attr :: attrs;
	}
	return Attributes(attrs);
}

xmlstring(x: ref Parser, dosub: int): string
{
	skipwhite(x);
	s := "";
	delim := getc(x);
	if (delim == '\"' || delim == '\'') {
		s = gets(x, delim);
		n := len s;
		if (n == 0 || s[n-1] != delim)
			error(x, "unclosed string at end of file");
		s = s[0:n-1];	# TO DO: avoid copy
		if(dosub)
			s = substituteentities(x, s);
	} else
		error(x, sys->sprint("illegal string delimiter: %c", delim));
	return s;
}

xmlname(x: ref Parser): string
{
	name := "";
	ch := getc(x);
	case ch {
	'_' or ':' or
	'a' to 'z' or
	'A' to 'Z' or
	16r100 to 16rd7ff or
	16re000 or 16rfffd =>
		name[0] = ch;
loop:
		for (;;) {
			case ch = getc(x) {
			'_' or '-' or ':' or '.' or
			'a' to 'z' or
			'0' to '9' or
			'A' to 'Z' or
			16r100 to 16rd7ff or
			16re000 to 16rfffd =>
				name[len name] = ch;
			* =>
				break loop;
			}
		}
	}
	ungetc(x);
	return name;
}

substituteentities(x: ref Parser, buff: string): string
{
	i := 0;
	while (i < len buff) {
		if (buff[i] == '&') {
			(t, j) := translateentity(x, buff, i);
			# XXX could be quicker
			buff = buff[0:i] + t + buff[j:];
			i += len t;
		} else
			i++;
	}
	return buff;
}

# subsitute entities, squashing whitespace along the way.
substituteentities_sp(x: ref Parser, buf: string): (string, int, int)
{
	firstwhite := 0;
	# skip initial white space
	for (i := 0; i < len buf; i++) {
		c := buf[i];
		if (c != ' ' && c != '\t' && c != '\n' && c != '\r')
			break;
		firstwhite = 1;
	}

	lastwhite := 0;
	s := "";
	for (; i < len buf; i++) {
		c := buf[i];
		if (c == ' ' || c == '\t' || c == '\n' || c == '\r')
			lastwhite = 1;
		else {
			if (lastwhite) {
				s[len s] = ' ';
				lastwhite = 0;
			}
			if (c == '&') {
				# should &x20; count as whitespace?
				(ent, j) := translateentity(x, buf, i);
				i = j - 1;
				s += ent;
			} else
				s[len s] = c;
		}
	}
	return (s, firstwhite, lastwhite);
}

translateentity(x: ref Parser, s: string, i: int): (string, int)
{
	i++;
	for (j := i; j < len s; j++)
		if (s[j] == ';')
			break;
	ent := s[i:j];
	if (j == len s) {
		if (len ent > 10)
			ent = ent[0:11] + "...";
		warning(x, sys->sprint("missing ; at end of entity (&%s)", ent));
		return (nil, i);
	}
	j++;
	if (ent == nil) {
		warning(x, "empty entity");
		return ("", j);
	}
	if (ent[0] == '#') {
		n: int;
		rem := ent;
		if (len ent >= 3 && ent[1] == 'x')
			(n, rem) = str->toint(ent[2:], 16);
		else if (len ent >= 2)
			(n, rem) = str->toint(ent[1:], 10);
		if (rem != nil) {
			warning(x, sys->sprint("unrecognized entity (&%s)", ent));
			return (nil, j);
		}
		ch: string = nil;
		ch[0] = n;
		return (ch, j);
	}
	hv := entdict.find(ent);
	if (hv == nil) {
		warning(x, sys->sprint("unrecognized entity (&%s)", ent));
		return (nil, j);
	}
	return (hv.s, j);
}

balancedstring(x: ref Parser, eos: string): string
{
	s := "";
	instring := 0;
	quote: int;

	for (i := 0; i < len eos; i++)
		s[len s] = ' ';

	skipwhite(x);
	while ((c := getc(x)) != Bufio->EOF) {
		s[len s] = c;
		if (instring) {
			if (c == quote)
				instring = 0;
		} else if (c == '\"' || c == '\'') {
			quote = c;
			instring = 1;
		} else if (s[len s - len eos : len s] == eos)
			return s[len eos : len s - len eos];
	}
	error(x, sys->sprint("unexpected end of file while looking for \"%s\"", eos));
	return "";
}

skipwhite(x: ref Parser)
{
	while ((c := getc(x)) == ' ' || c == '\t' || c == '\n' || c == '\r')
		;
	ungetc(x);
}

expectwhite(x: ref Parser)
{
	if ((c := getc(x)) != ' ' && c != '\t' && c != '\n' && c != '\r')
		error(x, "expecting white space");
	skipwhite(x);
}

expect(x: ref Parser, ch: int)
{
	skipwhite(x);
	c := getc(x);
	if (c != ch)
		error(x, sys->sprint("expecting %c", ch));
}

getc(x: ref Parser): int
{
	if (x.eof)
		return Bufio->EOF;
	ch := x.in.getc();
	if (ch == Bufio->EOF)
		x.eof = 1;
	else if (ch == '\n')
		x.loc.line++;
	x.lastnl = ch == '\n';
	return ch;
}

gets(x: ref Parser, delim: int): string
{
	if (x.eof)
		return "";
	s := x.in.gets(delim);
	for (i := 0; i < len s; i++)
		if (s[i] == '\n')
			x.loc.line++;
	if (s == "")
		x.eof = 1;
	else
		x.lastnl = s[len s - 1] == '\n';
	return s;
}

ungetc(x: ref Parser)
{
	if (x.eof)
		return;
	x.in.ungetc();
	x.loc.line -= x.lastnl;
}

Attributes.all(al: self Attributes): list of Attribute
{
	return al.attrs;
}

Attributes.get(attrs: self Attributes, name: string): string
{
	for (a := attrs.attrs; a != nil; a = tl a)
		if ((hd a).name == name)
			return (hd a).value;
	return nil;
}

warning(x: ref Parser, msg: string)
{
	if (x.warning != nil)
		x.warning <-= (x.loc, msg);
}

error(x: ref Parser, msg: string)
{
	x.errormsg = msg;
	raise "sax:error";
}