shithub: mc

Download patch

ref: 1708e4bc0d9d199b17054a5837e89fabfdc4298e
parent: 1018f0d5c592cbe9c2d78b7197369f422ffa26b0
parent: 06f83c93631d8af9eb25199a8229183c48b89ce8
author: Ori Bernstein <[email protected]>
date: Sat Oct 27 10:54:44 EDT 2018

Merge branch 'master' of git+ssh://git.eigenstate.org/git/ori/mc

--- a/bld.tags
+++ b/bld.tags
@@ -1,3 +1,7 @@
+fsbase: freebsd
+fsbase: linux
+fsbase: netbsd
+fsbase: openbsd
 futex: freebsd
 futex: linux
 futex: openbsd:6.2
--- a/lib/sys/sys+freebsd-x64.myr
+++ b/lib/sys/sys+freebsd-x64.myr
@@ -38,6 +38,7 @@
 	type cpulevel	= int
 	type cpusetid	= int
 	type idtype	= int
+	type sysarchop	= int64
 	
 	type acltype	= int
 	type acltag	= uint32
@@ -802,6 +803,13 @@
 	const Siglwp	: signo = Sigthr
 	const Siglibrt	: signo = 33	/* reserved by real-time library. */
 	
+	/* sysarch ops */
+	const Archamd64getfs   : sysarchop = 128
+	const Archamd64setfs   : sysarchop = 129
+	const Archamd64getgs   : sysarchop = 130
+	const Archamd64setgs   : sysarchop = 131
+	const Archamd64getxfpu : sysarchop = 132
+	
 	extern const syscall : (sc:scno, args:... -> int64)
 	extern var __cenvp : byte##
 	
@@ -1285,7 +1293,7 @@
 	const quotactl			:  (path : byte#, cmd : int, uid : int, arg : void# -> int)
 	const lgetfh			:  (fname : byte#, fhp : fhandle# -> int)
 	const getfh			:  (fname : byte#, fhp : fhandle# -> int)
-	const sysarch			:  (op : int, parms : byte# -> int)
+	const sysarch			:  (op : sysarchop, parms : void## -> int)
 	const rtprio			:  (function : int, pid : pid, rtp : rtprio# -> int)
 	const setfib			:  (fibnum : int -> int)
 	const ntp_adjtime		:  (tp : timex# -> int)
@@ -1969,7 +1977,7 @@
 	 -> (syscall(Sysgetfh, a(fname), a(fhp)) : int)
 }
 const sysarch	= {op, parms
-	 -> (syscall(Syssysarch, a(op), a(parms)) : int)
+	 -> (syscall(Syssysarch, op, a(parms)) : int)
 }
 const rtprio	= {function, pid, rtp
 	 -> (syscall(Sysrtprio, a(function), a(pid), a(rtp)) : int)
--- a/lib/sys/sys+linux-x64.myr
+++ b/lib/sys/sys+linux-x64.myr
@@ -45,6 +45,7 @@
 	type mfdflags	= uint32
 	type aiocontext	= uint64
 	type msg	= void#
+	type arch_prctlop	= uint64
 	
 	
 	type clock = union
@@ -590,6 +591,12 @@
 	
 	/* return value for a failed mapping */
 	const Mapbad	: byte# = (-1 : byte#)
+
+	/* arch_prctl ops */
+	const Archsetgs : arch_prctlop = 0x1001
+	const Archsetfs : arch_prctlop = 0x1002
+	const Archgetfs : arch_prctlop = 0x1003
+	const Archgetgs : arch_prctlop = 0x1004
 	
 	/* signal flags */
 	const Sanocldstop	: sigflags = 0x00000001
@@ -1097,6 +1104,7 @@
 	const Sysmq_notify		: scno = 244
 	const Sysmq_getsetattr		: scno = 245
 	const Sysprctl			: scno = 157
+	const Sysarch_prctl		: scno = 158
 	const Sysswapon			: scno = 167
 	const Sysswapoff		: scno = 168
 	const Sys_sysctl		: scno = 156
@@ -1308,7 +1316,7 @@
 	const settimeofday		:  (tv : timeval#, tz : timezone# -> int64)
 	const adjtimex			:  (txc_p : timex# -> int64)
 	const times			:  (tbuf : tms# -> int64)
-	const gettid			:  ( -> int64)
+	const gettid			:  ( -> pid)
 	const alarm			:  (seconds : uint -> int64)
 	const getppid			:  ( -> int64)
 	const geteuid			:  ( -> int64)
@@ -1484,6 +1492,7 @@
 	const mq_notify			:  (mqdes : int, notification : sigevent# -> int64)
 	const mq_getsetattr		:  (mqdes : int, mqstat : mq_attr#, omqstat : mq_attr# -> int64)
 	const prctl			:  (option : int, arg2 : uint64, arg3 : uint64, arg4 : uint64, arg5 : uint64 -> int64)
+	const arch_prctl		:  (op : arch_prctlop, addr : void# -> int64)
 	const swapon			:  (specialfile : byte#, swap_flags : int -> int64)
 	const swapoff			:  (specialfile : byte# -> int64)
 	const _sysctl			:  (args : sysctl_args# -> int64)
@@ -1782,7 +1791,7 @@
 	 -> (syscall(Systimes, a(tbuf)) : int64)
 }
 const gettid	= {
-	 -> (syscall(Sysgettid) : int64)
+	 -> (syscall(Sysgettid) : pid)
 }
 const alarm	= {seconds
 	 -> (syscall(Sysalarm, a(seconds)) : int64)
@@ -2308,6 +2317,9 @@
 }
 const prctl	= {option, arg2, arg3, arg4, arg5
 	 -> (syscall(Sysprctl, a(option), a(arg2), a(arg3), a(arg4), a(arg5)) : int64)
+}
+const arch_prctl	= {op, addr
+	 -> syscall(Sysarch_prctl, op, addr)
 }
 const swapon	= {specialfile, swap_flags
 	 -> (syscall(Sysswapon, a(specialfile), a(swap_flags)) : int64)
--- a/lib/sys/sys+netbsd-x64.myr
+++ b/lib/sys/sys+netbsd-x64.myr
@@ -18,6 +18,7 @@
 	type umtxop	= int32
 	type signo	= int32
 	type sigflags	= int32
+	type sysarchop	= int64
 
 	type clock = union
 		`Clockrealtime
@@ -344,6 +345,21 @@
 	const Umtxmtxwake2	: umtxop = 22
 	const Umtxmax	: umtxop = 23
 
+	/* sysarch ops */
+	const X8664getldt    : sysarchop = 0
+	const X8664setldt    : sysarchop = 1
+	const X8664iopl      : sysarchop = 2
+	const X8664getioperm : sysarchop = 3
+	const X8664setioperm : sysarchop = 4
+	const X8664oldvm86   : sysarchop = 5
+	const X8664getmtrr   : sysarchop = 11
+	const X8664setmtrr   : sysarchop = 12
+	const X8664vm86      : sysarchop = 13
+	const X8664getgsbase : sysarchop = 14
+	const X8664getfsbase : sysarchop = 15
+	const X8664setgsbase : sysarchop = 16
+	const X8664setfsbase : sysarchop = 17
+
 	/* signal actions */
 	const Saonstack		: sigflags = 0x0001	/* take signal on signal stack */
 	const Sarestart		: sigflags = 0x0002	/* restart system call on signal return */
@@ -908,6 +924,9 @@
 		new : void#, newsz : size# \
 		-> int)
 
+	/* misc */
+	const sysarch	: (op : sysarchop, args : void## -> int)
+
 	extern const cstring	: (str : byte[:] -> byte#)
 	/* filled by start code */
 	extern var __cenvp : byte##
@@ -1100,6 +1119,10 @@
 	/* all args already passed through a() or ar  ptrs */
 	-> (syscall(Sys__sysctl, \
 		(mib : int#), a(mib.len), old, oldsz, new, newsz) : int)
+}
+
+const sysarch = {op, args
+	-> (syscall(Syssysarch, op, args) : int)
 }
 
 const clockid = {clk
--- a/lib/sys/sys+openbsd-x64.myr
+++ b/lib/sys/sys+openbsd-x64.myr
@@ -215,7 +215,7 @@
 	const Mfixed	: mopt = 0x10
 	const Mfile	: mopt = 0x0
 	const Manon	: mopt = 0x1000
-	const Mstack	: mopt = 0x4000
+	const Mstack	: mopt = 0x0
 	const Mnoreplace	: mopt = 0x0800
 
 	/* file types */
--- a/lib/sys/sys+openbsd:6.1-x64.myr
+++ b/lib/sys/sys+openbsd:6.1-x64.myr
@@ -1031,7 +1031,7 @@
 	const symlinkat			:  (path : byte#, fd : int, link : byte# -> int)
 	const unlinkat			:  (fd : int, path : byte#, flag : int -> int)
 	const __set_tcb			:  (tcb : void# -> void)
-	const __get_tcb			:  ( -> void)
+	const __get_tcb			:  ( -> void#)
 ;;
 
 	/* start manual overrides { */
@@ -1750,5 +1750,5 @@
 	 -> (syscall(Sys__set_tcb, a(tcb)) : void)
 }
 const __get_tcb	= {
-	 -> (syscall(Sys__get_tcb) : void)
+	 -> (syscall(Sys__get_tcb) : void#)
 }
--- a/lib/sys/sys+openbsd:6.2-x64.myr
+++ b/lib/sys/sys+openbsd:6.2-x64.myr
@@ -348,7 +348,7 @@
 	const Mfixed	: mopt = 0x10
 	const Mfile	: mopt = 0x0
 	const Manon	: mopt = 0x1000
-	const Mstack	: mopt = 0x4000
+	const Mstack	: mopt = 0x0
 	const Mnoreplace	: mopt = 0x0800
 	
 	/* file types */
@@ -1037,7 +1037,7 @@
 	const symlinkat			:  (path : byte#, fd : int, link : byte# -> int)
 	const unlinkat			:  (fd : int, path : byte#, flag : int -> int)
 	const __set_tcb			:  (tcb : void# -> void)
-	const __get_tcb			:  ( -> void)
+	const __get_tcb			:  ( -> void#)
 ;;
 
 	/* start manual overrides { */
@@ -1759,5 +1759,5 @@
 	 -> (syscall(Sys__set_tcb, a(tcb)) : void)
 }
 const __get_tcb	= {
-	 -> (syscall(Sys__get_tcb) : void)
+	 -> (syscall(Sys__get_tcb) : void#)
 }
--- a/lib/sys/sys+openbsd:6.3-x64.myr
+++ b/lib/sys/sys+openbsd:6.3-x64.myr
@@ -1036,7 +1036,7 @@
 	const symlinkat			:  (path : byte#, fd : int, link : byte# -> int)
 	const unlinkat			:  (fd : int, path : byte#, flag : int -> int)
 	const __set_tcb			:  (tcb : void# -> void)
-	const __get_tcb			:  ( -> void)
+	const __get_tcb			:  ( -> void#)
 ;;
 
 	/* start manual overrides { */
@@ -1755,5 +1755,5 @@
 	 -> (syscall(Sys__set_tcb, a(tcb)) : void)
 }
 const __get_tcb	= {
-	 -> (syscall(Sys__get_tcb) : void)
+	 -> (syscall(Sys__get_tcb) : void#)
 }
--- a/lib/thread/bld.sub
+++ b/lib/thread/bld.sub
@@ -14,6 +14,11 @@
 	sem.myr
 	waitgrp.myr
 
+	# fsbase-based impls
+	tls+fsbase.myr
+	tls-impl+fsbase-x64.s
+	types+fsbase.myr
+
 	# futex-based impls
 	mutex+futex.myr
 	rwlock+futex.myr
@@ -23,6 +28,7 @@
 	# linux impl of basic thread primitives
 	condvar+linux.myr
 	exit+linux-x64.s
+	fsbase+linux.myr
 	futex+linux.myr
 	ncpu+linux.myr
 	spawn+linux.myr
@@ -30,6 +36,7 @@
 	# freebsd impl of thread primitives
 	condvar+freebsd.myr
 	exit+freebsd-x64.s
+	fsbase+freebsd.myr
 	futex+freebsd.myr
 	ncpu+freebsd.myr
 	spawn+freebsd.myr
@@ -37,6 +44,7 @@
 	# netbsd impl of thread primitives
 	#condvar+netbsd.myr
 	#mutex+netbsd.myr
+	fsbase+netbsd.myr
 	spawn+netbsd.myr
 	#ncpu+netbsd.myr
 	#exit+netbsd-x64.s
@@ -46,6 +54,9 @@
 	futex+osx.myr
 	spawn+osx.myr
 	start+osx-x64.s
+	tls+osx.myr
+	tls-impl+osx-x64.s
+	types+osx.myr
 
 	# 9front impl of thread primitives
 	#condvar+plan9.myr
@@ -58,6 +69,7 @@
 	# openbsd impl of thread primitives
 	condvar+openbsd:6.2.myr
 	exit+openbsd-x64.s
+	fsbase+openbsd.myr
 	futex+openbsd:6.2.myr
 	ncpu+openbsd.myr
 	spawn+openbsd.myr
--- a/lib/thread/common.myr
+++ b/lib/thread/common.myr
@@ -1,5 +1,3 @@
-use std
-
-pkg thread = 
+pkg thread =
 	pkglocal generic Zptr : @a#  = (0 : @a#)
 ;;
--- a/lib/thread/exit+freebsd-x64.s
+++ b/lib/thread/exit+freebsd-x64.s
@@ -1,19 +1,12 @@
 /*
 const thread.exit	: (stacksz : std.size -> void)
-NOTE: must be called from the bottom of the stack, since
-we assume that %rbp is in the top 4k of the stack.
 */
 .globl thread$exit
 thread$exit:
-	/* find top of stack */
-	movq	%rbp,%rdi	/* addr */
-	andq	$~0xfff,%rdi	/* align it */
-	addq	$0x1000,%rdi
-
 	/* munmap(base, size) */
 	movq	$73,%rax	/* munmap */
-	movq	-8(%rdi),%rsi	/* size */
-	subq	%rsi,%rdi	/* move to base ptr */
+	movq	%fs:0x08,%rdi	/* base */
+	movq	%fs:0x10,%rsi	/* stksz */
 	syscall
 
 	/* thr_exit(null) */
--- a/lib/thread/exit+linux-x64.s
+++ b/lib/thread/exit+linux-x64.s
@@ -1,19 +1,12 @@
 /*
-const thread.exit	: (stacksz : std.size -> void)
-NOTE: must be called from the bottom of the stack, since
-we assume that %rbp is in the top 4k of the stack.
+const thread.exit : (-> void)
 */
 .globl thread$exit
 thread$exit:
-	/* find top of stack */
-	movq	%rbp,%rdi	/* addr */
-	andq	$~0xfff,%rdi	/* align it */
-	addq	$0x1000,%rdi
-
 	/* munmap(base, size) */
 	movq	$11,%rax	/* munmap */
-	movq	-8(%rdi),%rsi	/* size */
-	subq	%rsi,%rdi	/* move to base ptr */
+	movq	%fs:0x08,%rdi	/* base */
+	movq	%fs:0x10,%rsi	/* stksz */
 	syscall
 
 	/* thread_exit(0) */
--- a/lib/thread/exit+openbsd-x64.s
+++ b/lib/thread/exit+openbsd-x64.s
@@ -1,15 +1,8 @@
 /*
 const thread.exit	: (stacksz : std.size -> void)
-NOTE: must be called from the bottom of the stack, since
-we assume that %rbp is in the top 4k of the stack.
 */
 .globl thread$exit
 thread$exit:
-	/* find top of stack */
-	movq	%rbp,%rdi	/* addr */
-	andq	$~0xfff,%rdi	/* align it */
-	addq	$0x1000,%rdi
-
 	/* 
 	  Because OpenBSD wants a valid stack whenever
 	  we enter the kernel, we need to toss a preallocated
@@ -19,8 +12,8 @@
 
 	/* munmap(base, size) */
 	movq	$73,%rax	/* munmap */
-	movq	-8(%rdi),%rsi	/* size */
-	subq	%rsi,%rdi	/* move to base ptr */
+	movq	%fs:0x08,%rdi	/* base */
+	movq	%fs:0x10,%rsi	/* stksz */
 	syscall
 
 	/* __threxit(0) */
--- /dev/null
+++ b/lib/thread/fsbase+freebsd.myr
@@ -1,0 +1,28 @@
+use std
+use sys
+
+use "types"
+
+pkg thread =
+	pkglocal const setfsbase : (h : tlshdr# -> void)
+	pkglocal const getfsbase : (-> tlshdr#)
+;;
+
+/* Hands `h` to sysarch(Archamd64setfs); a nonzero result is printed and then std.suicide() is called. */
+const setfsbase = {h
+	var rc = sys.sysarch(sys.Archamd64setfs, &(h : void#))
+	if rc != 0
+		std.fput(std.Err, "error: sysarch returned {}\n", rc)
+		std.suicide()
+	;;
+}
+
+const getfsbase = {
+	var h	/* out-parameter: sysarch(Archamd64getfs) is given &h */
+	var rc = sys.sysarch(sys.Archamd64getfs, &h)
+	if rc != 0
+		std.fput(std.Err, "error: sysarch returned {}\n", rc)
+		std.suicide()
+	;;
+	-> (h : tlshdr#)
+}
--- /dev/null
+++ b/lib/thread/fsbase+linux.myr
@@ -1,0 +1,28 @@
+use std
+use sys
+
+use "types"
+
+pkg thread =
+	pkglocal const setfsbase : (h : tlshdr# -> void)
+	pkglocal const getfsbase : (-> tlshdr#)
+;;
+
+/* Hands `h` to arch_prctl(Archsetfs); a nonzero result is printed and then std.suicide() is called. */
+const setfsbase = {h
+	var rc = sys.arch_prctl(sys.Archsetfs, (h : void#))
+	if rc != 0
+		std.fput(std.Err, "error: arch_prctl returned {}\n", rc)
+		std.suicide()
+	;;
+}
+
+const getfsbase = {
+	var h : tlshdr#	/* out-parameter: arch_prctl(Archgetfs) is given &h */
+	var rc = sys.arch_prctl(sys.Archgetfs, (&h : void#))
+	if rc != 0
+		std.fput(std.Err, "error: arch_prctl returned {}\n", rc)
+		std.suicide()
+	;;
+	-> h
+}
--- /dev/null
+++ b/lib/thread/fsbase+netbsd.myr
@@ -1,0 +1,28 @@
+use std
+use sys
+
+use "types"
+
+pkg thread =
+	pkglocal const setfsbase : (h : tlshdr# -> void)
+	pkglocal const getfsbase : (-> tlshdr#)
+;;
+
+/* Hands `h` to sysarch(X8664setfsbase); a nonzero result is printed and then std.suicide() is called. */
+const setfsbase = {h
+	var rc = sys.sysarch(sys.X8664setfsbase, &(h : void#))
+	if rc != 0
+		std.fput(std.Err, "error: sysarch returned: {}\n", rc)
+		std.suicide()
+	;;
+}
+
+const getfsbase = {
+	var h	/* out-parameter: sysarch(X8664getfsbase) is given &h */
+	var rc = sys.sysarch(sys.X8664getfsbase, &h)
+	if rc != 0
+		std.fput(std.Err, "error: sysarch returned: {}\n", rc)
+		std.suicide()
+	;;
+	-> (h : tlshdr#)
+}
--- /dev/null
+++ b/lib/thread/fsbase+openbsd.myr
@@ -1,0 +1,16 @@
+use sys
+
+use "types"
+
+pkg thread =
+	pkglocal const setfsbase : (h : tlshdr# -> void)
+	pkglocal const getfsbase : (-> tlshdr#)
+;;
+
+const setfsbase = {h
+	sys.__set_tcb((h : void#))	/* the header pointer is passed as the tcb argument; no result to check */
+}
+
+const getfsbase = {
+	-> (sys.__get_tcb() : tlshdr#)	/* __get_tcb returns a void#; cast it back to the header type */
+}
--- a/lib/thread/mutex+futex.myr
+++ b/lib/thread/mutex+futex.myr
@@ -1,9 +1,14 @@
+use std
+
 use "atomic"
 use "futex"
+use "tls"
+use "types"
 
 pkg thread =
 	type mutex = struct
 		_state	: ftxtag
+		_owner	: tid
 	;;	
 
 	const mkmtx	: (-> mutex)
@@ -21,12 +26,19 @@
 var nspin = 10	/* FIXME: pick a sane number, based on CPU count */
 
 const mkmtx = {
-	-> [._state = Unlocked]
+	-> [._state = Unlocked, ._owner = -1]
 }
 
 const mtxlock = {mtx
 	var c
 
+	if mtx._owner == tid()
+		std.fput(std.Err,
+			"error: thread {} attempted to relock a mutex it already holds\n",
+			tid())
+		std.suicide()
+	;;
+
 	/*
 	Uncontended case: we get an unlocked mutex, and we lock it.
 	*/
@@ -34,6 +46,7 @@
 	for var i = 0; i < nspin; i++
 		c = xcas(&mtx._state, Unlocked, Locked)
 		if c == Unlocked
+			mtx._owner = tid()
 			-> void
 		;;
 	;;
@@ -51,14 +64,32 @@
 		ftxwait(&mtx._state, Contended, -1)
 		c = xchg(&mtx._state, Contended)
 	;;
+	mtx._owner = tid()
 }
 
 const mtxtrylock = {mtx
-	-> xcas(&mtx._state, Unlocked, Locked) == Unlocked
+	if xcas(&mtx._state, Unlocked, Locked) == Unlocked
+		mtx._owner = tid()
+		-> true
+	;;
+	-> false
 }
 
 const mtxunlock = {mtx
 	/*
+	Nonatomically loading mtx._owner may produce false negatives on
+	weakly-ordered architectures but having to atomically store and load
+	mtx._owner doesn't seem worth it.
+	*/
+	if mtx._owner != tid()
+		std.fput(std.Err,
+			"error: thread {} attempted to unlock a mutex last held by {}\n",
+			tid(), mtx._owner)
+		std.suicide()
+	;;
+	mtx._owner = -1
+
+	/*
 	Either the lock is contended or it's uncontended. Any other
 	state is a bug.
 
@@ -72,7 +103,15 @@
 }
 
 const mtxcontended = {mtx
+	if mtx._owner == tid()
+		std.fput(std.Err,
+			"error: thread {} attempted to relock a mutex it already holds\n",
+			tid())
+		std.suicide()
+	;;
+
 	while xchg(&mtx._state, Contended) != Unlocked
 		ftxwait(&mtx._state, Contended, -1)
 	;;
+	mtx._owner = tid()
 }
--- a/lib/thread/mutex.myr
+++ b/lib/thread/mutex.myr
@@ -1,5 +1,4 @@
 use std
-use sys
 
 use "atomic"
 
--- a/lib/thread/rwlock+futex.myr
+++ b/lib/thread/rwlock+futex.myr
@@ -2,6 +2,8 @@
 
 use "atomic"
 use "futex"
+use "tls"
+use "types"
 
 pkg thread =
 	/*
@@ -13,6 +15,7 @@
 	*/
 	type rwlock = struct
 		_state : ftxtag
+		_owner : tid
 	;;
 
 	const mkrwlock  : (-> rwlock)
@@ -28,7 +31,7 @@
 const Waitbit = 0x80000000
 
 const mkrwlock = {
-	-> [._state = 0]
+	-> [._state = 0, ._owner = -1]
 }
 
 const rdlock = {rw
@@ -61,6 +64,13 @@
 
 const wrlock = {rw
 	for ; ;
+		if rw._owner == tid()
+			std.fput(std.Err,
+				"error: thread {} attempted to relock an rwlock it already holds\n",
+				tid())
+			std.suicide()
+		;;
+
 		/*
 		`_state` must be 0 for a writer to acquire the lock. Anything
 		else means the lock is either held or in the process of being
@@ -68,6 +78,7 @@
 		 */
 		var s = xcas(&rw._state, 0, Nrmask)
 		if s == 0
+			rw._owner = tid()
 			-> void
 		;;
 
@@ -98,7 +109,11 @@
 }
 
 const trywrlock = {rw
-	-> xcas(&rw._state, 0, Nrmask) == 0
+	if xcas(&rw._state, 0, Nrmask) == 0
+		rw._owner = tid()
+		-> true
+	;;
+	-> false
 }
 
 const rdunlock = {rw
@@ -122,6 +137,14 @@
 }
 
 const wrunlock = {rw
+	if rw._owner != tid()
+		std.fput(std.Err,
+			"error: thread {} attempted to unlock an rwlock last held by {}\n",
+			tid(), rw._owner)
+		std.suicide()
+	;;
+	rw._owner = -1
+
 	/*
 	If the wait bit was set then there are one or more waiting readers,
 	writers, or both. In the first and third cases, we need to wake
--- a/lib/thread/spawn+freebsd.myr
+++ b/lib/thread/spawn+freebsd.myr
@@ -1,9 +1,12 @@
 use sys
 use std
 
-pkg thread =
-	type tid = uint64
+use "common"
+use "fsbase"
+use "tls"
+use "types"
 
+pkg thread =
 	const spawn : (fn : (-> void) -> std.result(tid, byte[:]))
 ;;
 
@@ -16,60 +19,65 @@
 }
 
 const spawnstk = {fn, sz
-	var stk : byte#, tid, ctid, ret
-	var szp, f, tos, env, envsz
+	var stk, tos, stksz, hdr, tid = -1, ret
 
-	stk = getstk(sz)
+	stk = sys.mmap((0 : byte#), sz, sys.Mprotrw, sys.Mpriv | sys.Manon, -1, 0)
 	if stk == sys.Mapbad
 		-> `std.Err "couldn't get stack"
 	;;
-	tid = -1
-	/* find top of stack */
-	tos = (stk : std.intptr) + (sz : std.intptr)
+	(tos, stksz, hdr) = initstk(stk, fn, sz)
 
-	/* store the stack size */
-	tos -= sizeof(sys.size)
-	sz -= sizeof(sys.size)
-	szp = (tos : sys.size#)
-	szp# = Stacksz
-
-	/* store the function we call */
-	envsz = std.fnenvsz(fn)
-	tos -= (envsz : std.intptr)
-	sz -= (envsz : sys.size)
-	env = tos
-	tos -= sizeof((->void))
-	sz -= sizeof((->void))
-	f = (tos : (->void)#)
-	f# = std.fnbdup(fn, (env : byte#)[:envsz])
-	var repr = (&fn : int64[2]#)#
-
 	ret = sys.thr_new(&[
 		.startfn = (startthread : void#),
 		.arg = (tos : void#),
 		.stkbase = (stk : byte#),
-		.stksz = sz,
-		.tid = &ctid,
+		.stksz = stksz,
+		.tid = (&hdr.tid : uint64#),
 		.ptid = &tid,
 		.flags = 2,
-		.rtp = (0 : sys.rtprio#)
+		.rtp = Zptr,
 	], sizeof(sys.thrparam))
 
 	if ret < 0
+		sys.munmap(stk, sz)
 		-> `std.Err "couldn't spawn thread"
 	;;
 	-> `std.Ok (tid : tid)
 }
 
-const getstk = {sz
-	var p, m
+const initstk = {stk, fn, sz
+	var stksz, len, tos, hdr, fp, env, envsz
 
-	p = sys.mmap((0 : byte#), sz, sys.Mprotrw, sys.Mpriv | sys.Manon, -1, 0)
-	if p == sys.Mapbad
-		-> p
-	;;
-	m = (p : std.intptr)
-	-> (m : byte#)
+	stksz = sz
+	len = tlslen()
+	stksz -= (sizeof(tlshdr) + ((len : sys.size) * sizeof(void#)) + 0xf) & ~0xf
+	tos = (stk : std.intptr) + (stksz : std.intptr)
+	hdr = (tos : tlshdr#)
+	hdr.base = stk
+	hdr.stksz = sz
+	fn = std.fndup(fn)
+
+	var fn1 = {
+		/*
+		We write `hdr.len` here because it follows `hdr.tid` so it gets
+		overwritten by the kernel in `thr_new`. Even though `sys.pid`
+		is 32 bits, `thr_param.tid` is a `uint64#` for legacy reasons.
+		*/
+		hdr.len = len
+		setfsbase(hdr)
+		fn()
+		std.fnfree(fn)
+	}
+
+	envsz = std.fnenvsz(fn1)
+	tos -= (envsz : std.intptr)
+	stksz -= (envsz : sys.size)
+	env = tos
+	tos -= sizeof((->void))
+	stksz -= sizeof((->void))
+	fp = (tos : (->void)#)
+	fp# = std.fnbdup(fn1, (env : byte#)[:envsz])
+	-> ((tos : byte#), stksz, hdr)
 }
 
 const startthread = {f : (-> void)#
--- a/lib/thread/spawn+linux.myr
+++ b/lib/thread/spawn+linux.myr
@@ -1,72 +1,67 @@
 use sys
 use std
 
-pkg thread =
-	type tid = sys.pid
+use "common"
+use "tls"
+use "types"
 
+pkg thread =
 	const spawn : (fn : (-> void) -> std.result(tid, byte[:]))
 ;;
 
+const Stacksz = 8*std.MiB
 extern const exit : (-> void)
 
 /* Holy shit flag mania. */
-const Thrflag = sys.Clonevm | sys.Clonefs | sys.Clonefiles  | \
-	sys.Clonesighand | sys.Clonethread |sys.Clonesysvsem | \
-	sys.Clonesettls | sys.Cloneparentsettid | sys.Clonechildcleartid
+const Thrflag = sys.Clonevm | sys.Clonefs | sys.Clonefiles | \
+	sys.Clonesighand | sys.Clonethread | sys.Clonesettls | \
+	sys.Clonechildsettid
 
-const Stacksz = 8*std.MiB
-
 const spawn = {fn
 	-> spawnstk(fn, Stacksz)
 }
 
 const spawnstk = {fn, sz
-	var stk : byte#, tid, ctid, ret
+	var stk, tos, hdr, ret
 
-	stk = getstk(sz)
+	stk = sys.mmap((0 : byte#), sz, sys.Mprotrw, sys.Mpriv | sys.Manon, -1, 0)
 	if stk == sys.Mapbad
 		-> `std.Err "couldn't get stack"
 	;;
-	stk = initstack(stk, fn, Stacksz)
+	(tos, hdr) = initstk(stk, fn, sz)
 
-	ret = sys.fnclone(Thrflag, \
-		(stk : byte#),\
-		&tid, (0 : byte#), \
-		&ctid, (0 : byte#), \
+	ret = sys.fnclone(Thrflag,
+		tos,
+		Zptr,
+		(hdr : byte#),
+		(&hdr.tid : sys.pid#),
+		Zptr,
 		(startthread : void#))
 	if ret < 0
+		sys.munmap(stk, sz)
 		-> `std.Err "couldn't spawn thread"
 	;;
 	-> `std.Ok (ret : tid)
 }
 
-const initstack = {stk, fn, sz
-	var tos, szp, fp, env, envsz
+const initstk = {stk, fn, sz
+	var len, tos, hdr, fp, env, envsz
 
+	len = tlslen()
+	tos = (stk : std.intptr) + (sz : std.intptr)
+	tos -= (sizeof(tlshdr) + ((len : std.intptr) * sizeof(void#)) + 0xf) & ~0xf
+	hdr = (tos : tlshdr#)
+	hdr.len = len
+	hdr.base = stk
+	hdr.stksz = sz
+
 	envsz = std.fnenvsz(fn)
-	tos = (stk : std.intptr)
-	tos -= sizeof(int64)
-	szp = (tos : sys.size#)
-	szp# = sz
 	tos -= (envsz : std.intptr)
 	env = tos
 	tos -= sizeof((->void))
 	fp = (tos : (->void)#)
 	fp# = std.fnbdup(fn, (env : byte#)[:envsz])
-	-> (tos : byte#)
-}
-
-const getstk = {sz
-	var p, m
-
-	p = sys.mmap((0 : byte#), sz, sys.Mprotrw, sys.Mpriv | sys.Manon, -1, 0)
-	if p == sys.Mapbad
-		-> p
-	;;
-	/* stack starts at the top of memory and grows down. */
-	m = (p : std.intptr)
-	m += (sz : std.intptr)
-	-> (m : byte#)
+	-> ((tos : byte#), hdr)
 }
 
 const startthread = {fn : (-> void)
--- a/lib/thread/spawn+openbsd.myr
+++ b/lib/thread/spawn+openbsd.myr
@@ -1,9 +1,11 @@
 use std
 use sys
 
-pkg thread =
-	type tid = uint64
+use "common"
+use "tls"
+use "types"
 
+pkg thread =
 	const spawn : (fn : (-> void) -> std.result(tid, byte[:]))
 	pkglocal var exitstk : byte#
 ;;
@@ -18,6 +20,7 @@
 	  time to swap to before we invalidate a stack.
 	 */
 	exitstk = getstk(16)
+	std.assert(exitstk != sys.Mapbad, "error: failed to mmap exitstk\n")
 }
 
 const spawn = {fn;
@@ -25,30 +28,17 @@
 }
 
 const spawnstk = {fn, sz
-	var stk, szp, fp, tos, tfp, env, envsz
-	var ret
+	var stk, tos, hdr, tfp, ret
 
 	stk = getstk(sz)
 	if stk == sys.Mapbad
 		-> `std.Err "couldn't get stack"
 	;;
-	/* store size */
-	tos = (stk : std.intptr)
-	tos -= sizeof(int64)
-	szp = (tos : sys.size#)
-	szp# = Stacksz
+	(tos, hdr) = initstk(stk, fn, sz)
 
-	/* store func */
-	envsz = std.fnenvsz(fn)
-	tos -= (envsz : std.intptr)
-	env = tos
-	tos -= sizeof((->void))
-	fp = (tos : (->void)#)
-	fp# = std.fnbdup(fn, (env : byte#)[:envsz])
-
 	tfp = [
-		.tcb = (0 : void#),
-		.tid = &ret,
+		.tcb = (hdr : void#),
+		.tid = (&hdr.tid : sys.pid#),
 		.stk = (tos : byte#),
 	]
 	ret = sys.__tfork_thread(&tfp,
@@ -56,22 +46,34 @@
 		(startthread : void#),
 		(0 : void#))
 	if ret < 0
+		sys.munmap(stk, sz)
 		-> `std.Err "couldn't spawn thread"
 	;;
 	-> `std.Ok (ret : tid)
 }
 
-const getstk = {sz
-	var p, m
+const initstk = {stk, fn, sz
+	var len, tos, hdr, fp, env, envsz
 
-	p = sys.mmap((0 : byte#), sz, sys.Mprotrw, sys.Mpriv | sys.Manon | sys.Mstack, -1, 0)
-	if p == sys.Mapbad
-		-> p
-	;;
-	/* stack starts at the top of memory and grows down. */
-	m = (p : std.intptr)
-	m += (sz : std.intptr)
-	-> (m : byte#)
+	len = tlslen()
+	tos = (stk : std.intptr) + (sz : std.intptr)
+	tos -= (sizeof(tlshdr) + ((len : std.intptr) * sizeof(void#)) + 0xf) & ~0xf
+	hdr = (tos : tlshdr#)
+	hdr.len = len
+	hdr.base = stk
+	hdr.stksz = sz
+
+	envsz = std.fnenvsz(fn)
+	tos -= (envsz : std.intptr)
+	env = tos
+	tos -= sizeof((->void))
+	fp = (tos : (->void)#)
+	fp# = std.fnbdup(fn, (env : byte#)[:envsz])
+	-> ((tos : byte#), hdr)
+}
+
+const getstk = {sz
+	-> sys.mmap((0 : byte#), sz, sys.Mprotrw, sys.Mpriv | sys.Manon, -1, 0)
 }
 
 const startthread = {fn : (-> void)
--- a/lib/thread/spawn+osx.myr
+++ b/lib/thread/spawn+osx.myr
@@ -1,9 +1,10 @@
 use sys
 use std
 
-pkg thread =
-	type tid = uint64
+use "tls"
+use "types"
 
+pkg thread =
 	const spawn : (fn : (-> void) -> std.result(tid, byte[:]))
 ;;
 
@@ -34,35 +35,14 @@
 }
 
 const spawnstk = {fn, sz
-	var stk : byte#, tid, ret
-	var szp, f, tos, env, envsz
+	var stk, tos, ret
 
-	stk = getstk(sz)
+	stk = sys.mmap((0 : byte#), sz, sys.Mprotrw, sys.Mpriv | sys.Manon, -1, 0)
 	if stk == sys.Mapbad
 		-> `std.Err "couldn't get stack"
 	;;
-	tid = -1
+	tos = initstk(stk, fn, sz)
 
-	/* find top of stack */
-	tos = (stk : std.intptr) + (sz : std.intptr)
-
-	/* store the stack size */
-	tos -= sizeof(sys.size)
-	sz -= sizeof(sys.size)
-	szp = (tos : sys.size#)
-	szp# = Stacksz
-
-	/* store the function we call */
-	envsz = std.fnenvsz(fn)
-	tos -= (envsz : std.intptr)
-	sz -= (envsz : sys.size)
-	env = tos
-	tos -= sizeof((->void))
-	sz -= sizeof((->void))
-	f = (tos : (->void)#)
-	f# = std.fnbdup(fn, (env : byte#)[:envsz])
-	var repr = (&fn : int64[2]#)#
-
 	ret = sys.bsdthread_create( \
 		(tramp	: void#), \	/* start */
 		(tos	: void#), \		/* arg */
@@ -70,21 +50,39 @@
 		(0	: void#), \		/* pthread struct */
 		0x01000000)			/* flags (PTHREAD_START_CUSTOM): don't alloc stack in kernel */
 
-	if ret == (-1 : void#)
+	if (ret : std.size) < 0
+		sys.munmap(stk, sz)
 		-> `std.Err "couldn't spawn thread"
 	;;
-	-> `std.Ok (ret : tid)
+	-> `std.Ok (stk : tid)
 }
 
-const getstk = {sz
-	var p, m
+const initstk = {stk, fn, sz
+	var len, tos, hdr, fp, env, envsz
 
-	p = sys.mmap((0 : byte#), sz, sys.Mprotrw, sys.Mpriv | sys.Manon, -1, 0)
-	if p == sys.Mapbad
-		-> p
-	;;
-	m = (p : std.intptr)
-	-> (m : byte#)
+	len = tlslen()
+	tos = (stk : std.intptr) + (sz : std.intptr)
+	tos -= (sizeof(tlshdr) + ((len : std.intptr) * sizeof(void#)) + 0xf) & ~0xf
+	hdr = (tos : tlshdr#)
+	hdr.tid = (stk : tid)
+	hdr.len = len
+	hdr.base = stk
+	hdr.stksz = sz
+	fn = std.fndup(fn)
+
+	var fn1 = {
+		setgsbase(hdr)
+		fn()
+		std.fnfree(fn)
+	}
+
+	envsz = std.fnenvsz(fn1)
+	tos -= (envsz : std.intptr)
+	env = tos
+	tos -= sizeof((->void))
+	fp = (tos : (->void)#)
+	fp# = std.fnbdup(fn1, (env : byte#)[:envsz])
+	-> (tos : byte#)
 }
 
 /*
--- a/lib/thread/start+osx-x64.s
+++ b/lib/thread/start+osx-x64.s
@@ -15,20 +15,13 @@
 	
 /*
 const thread.exit	: (stacksz : std.size -> void)
-NOTE: must be called from the bottom of the stack, since
-we assume that %rbp is in the top 4k of the stack.
 */
 .globl _thread$exit
 _thread$exit:
-	/* find top of stack */
-	movq	%rbp,%rdi	/* addr */
-	andq	$~0xfff,%rdi	/* align it */
-	addq	$0x1000,%rdi
-
 	/* munmap(base, size) */
 	movq	$0x2000049,%rax	/* munmap */
-	movq	-8(%rdi),%rsi	/* size */
-	subq	%rsi,%rdi	/* move to base ptr */
+	movq	%gs:0x10,%rdi	/* base: tid and len are 8 bytes each on osx, so base is at 0x10, not 0x08 */
+	movq	%gs:0x18,%rsi	/* stksz: the field after base, just below `self` at 0x20 */
 	syscall
 
 	/* exit the thread */
--- /dev/null
+++ b/lib/thread/test/die.myr
@@ -1,0 +1,8 @@
+use thread
+
+const main = {
+	var m = thread.mkmtx()
+	thread.mtxlock(&m)
+	thread.mtxunlock(&m)	/* legitimate unlock by the holder */
+	thread.mtxunlock(&m)	/* second unlock: the futex mtxunlock owner check is expected to abort here */
+}
--- /dev/null
+++ b/lib/thread/test/tls.myr
@@ -1,0 +1,49 @@
+use std
+use sys
+use thread
+
+const Nelt = 100
+const Nthr = 100
+
+var elts : thread.tid[Nelt]
+var start
+var wg
+
+/* Run by every thread: stash tid-derived values in Nelt tls slots, read them back, then post to wg. */
+const setget = {
+	var vals : thread.tid[Nelt]
+	for var i = 0; i < Nelt; i++
+		vals[i] = elts[i] + thread.tid()
+	;;
+	/* first pass stores a pointer per key, walking keys upward from `start` */
+	var key = start
+	for var i = 0; i < Nelt; i++
+		thread.tlsset(key, &vals[i])
+		key++
+	;;
+	key = start
+	for var i = 0; i < Nelt; i++
+		std.assert(thread.tlsget(key)# == vals[i], "tls is broken\n")
+		key++
+	;;
+	thread.wgpost(&wg)
+}
+
+/* Fills elts with random values, reserves Nelt tls keys, then runs setget on Nthr threads in total. */
+const main = {
+	for var i = 0; i < Nelt; i++
+		elts[i] = std.randnum()
+	;;
+	/* `start` is the first key; setget reaches the other Nelt - 1 by incrementing it */
+	start = thread.tlsalloc()
+	for var i = 1; i < Nelt; i++
+		var k : thread.tlskey(thread.tid#) = thread.tlsalloc()
+	;;
+	/* the wait group expects Nthr posts: Nthr - 1 spawned threads plus this one */
+	wg = thread.mkwg(Nthr)
+	for var i = 1; i < Nthr; i++
+		thread.spawn(setget)
+	;;
+	setget()
+	thread.wgwait(&wg)
+}
--- /dev/null
+++ b/lib/thread/tls+fsbase.myr
@@ -1,0 +1,59 @@
+use std
+
+use "common"
+use "fsbase"
+use "types"
+
+pkg thread =
+	generic      tlsalloc : (-> tlskey(@a#))
+	generic      tlsset   : (k : tlskey(@a#), v : @a# -> void)
+	generic      tlsget   : (k : tlskey(@a#) -> @a#)
+	extern const tid      : (-> tid)
+
+	pkglocal const        tlsoob : (k : tlskey(void) -> void)
+	pkglocal extern const tlslen : (-> tlskey(void))
+;;
+
+const Staticcap = 8
+
+var _hdr
+var _cap = Staticcap
+
+generic tlsalloc = {
+	std.assert(tid() == 0, "error: tlsalloc must be called from main thread\n")
+	if _hdr == Zptr
+		/* `_hdr` is lazily initialized here since we can't set it in start.s */
+		_hdr = getfsbase()
+	;;
+	/* hand out the next slot: len is bumped unconditionally and its old value is compared against _cap */
+	if _hdr.len++ == _cap
+		std.assert(_cap < 0x8000_0000, "error: max tls slots exceeded\n")
+		var l = sizeof(tlshdr) + ((_cap : std.size) * sizeof(void#))	/* bytes in the current region */
+		var h = std.bytealloc(sizeof(tlshdr) + ((_cap *= 2 : std.size) * sizeof(void#)))	/* doubles _cap in passing */
+		/* copy the old header and slots across, then repoint the fs base at the copy */
+		std.memblit(h, (_hdr : byte#), l)
+		setfsbase((h : tlshdr#))
+		/* this is ugly... the initial tls region is statically allocated */
+		if _cap != Staticcap * 2	/* true on every growth except the first */
+			std.bytefree((_hdr : byte#), l)
+		;;
+		_hdr = (h : tlshdr#)
+	;;
+	-> (_hdr.len - 1 : tlskey(@a#))
+}
+
+generic tlsset = {k, v
+	_tlsset((k : tlskey(void)), (v : void#))	/* erase the element type and defer to the extern _tlsset */
+}
+
+generic tlsget = {k
+	-> (_tlsget((k : tlskey(void))) : @a#)	/* extern _tlsget returns void#; cast back to the key's pointer type */
+}
+
+const tlsoob = {k
+	std.fput(std.Err, "error: tlskey {} out of bounds {}\n", k, tlslen())	/* offending key, then the current tlslen() */
+	std.suicide()
+}
+
+extern const _tlsset : (k : tlskey(void), v : void# -> void)
+extern const _tlsget : (k : tlskey(void) -> void#)
--- /dev/null
+++ b/lib/thread/tls+osx.myr
@@ -1,0 +1,70 @@
+use std
+
+use "common"
+use "types"
+
+pkg thread =
+	generic      tlsalloc : (-> tlskey(@a#))
+	generic      tlsset   : (k : tlskey(@a#), v : @a# -> void)
+	generic      tlsget   : (k : tlskey(@a#) -> @a#)
+	extern const tid      : (-> tid)
+
+	pkglocal const        tlsoob    : (k : tlskey(void) -> void)
+	pkglocal extern const tlslen    : (-> tlskey(void))
+	pkglocal const        setgsbase : (h : tlshdr# -> void)
+	pkglocal extern const getgsbase : (-> tlshdr#)
+;;
+
+const Staticcap = 8
+
+var _hdr
+var _cap = Staticcap
+
+generic tlsalloc = {
+	std.assert(tid() == 0, "error: tlsalloc must be called from main thread\n")
+	if _hdr == Zptr
+		/* `_hdr` is lazily initialized here since we can't set it in start.s */
+		_hdr = getgsbase()
+	;;
+	/* hand out the next slot: len is bumped unconditionally and its old value is compared against _cap */
+	if _hdr.len++ == _cap
+		std.assert(_cap < 0x8000_0000, "error: max tls slots exceeded\n")
+		var l = sizeof(tlshdr) + ((_cap : std.size) * sizeof(void#))	/* bytes in the current region */
+		var h = std.bytealloc(sizeof(tlshdr) + ((_cap *= 2 : std.size) * sizeof(void#)))	/* doubles _cap in passing */
+		/* copy the old header and slots across, then repoint the gs base at the copy */
+		std.memblit(h, (_hdr : byte#), l)
+		setgsbase((h : tlshdr#))
+		/* this is ugly... the initial tls region is statically allocated */
+		if _cap != Staticcap * 2	/* true on every growth except the first */
+			std.bytefree((_hdr : byte#), l)
+		;;
+		_hdr = (h : tlshdr#)
+	;;
+	-> (_hdr.len - 1 : tlskey(@a#))
+}
+
+generic tlsset = {k, v
+	_tlsset((k : tlskey(void)), (v : void#))	/* erase the element type and defer to the extern _tlsset */
+}
+
+generic tlsget = {k
+	-> (_tlsget((k : tlskey(void))) : @a#)	/* extern _tlsget returns void#; cast back to the key's pointer type */
+}
+
+const tlsoob = {k
+	std.fput(std.Err, "error: tlskey {} out of bounds {}\n", k, tlslen())	/* offending key, then the current tlslen() */
+	std.suicide()
+}
+
+/* Wraps the extern _setgsbase; any result other than 0xf is printed and then std.suicide() is called. */
+const setgsbase = {h
+	var rc = _setgsbase(h)
+	if rc != 0xf	/* yes, 0xf indicates success; no, it's not documented */
+		std.fput(std.Err, "error: setgsbase returned {}\n", rc)
+		std.suicide()
+	;;
+}
+
+extern const _tlsset    : (k : tlskey(void), v : void# -> void)
+extern const _tlsget    : (k : tlskey(void) -> void#)
+extern const _setgsbase : (h : tlshdr# -> int64)
--- /dev/null
+++ b/lib/thread/tls-impl+fsbase-x64.s
@@ -1,0 +1,48 @@
+.set tid,	0x00	/* offset of tlshdr.tid (types+fsbase.myr) */
+.set len,	0x04	/* offset of tlshdr.len */
+.set slots,	0x18	/* offset of tlshdr.slots */
+
+/* const tid : (-> tid); returns the 32-bit tid field read through %fs */
+.globl thread$tid
+.globl _thread$tid
+thread$tid:
+_thread$tid:
+	movl	%fs:tid, %eax
+	ret
+
+/* const _tlsset : (k : key, v : void# -> void); k is read from %edi, v from %rsi */
+.globl thread$_tlsset
+.globl _thread$_tlsset
+thread$_tlsset:
+_thread$_tlsset:
+	cmpl	%fs:len, %edi	/* unsigned compare of k against tlshdr.len */
+	jnb	oob		/* k >= len: report the bad key */
+
+	movslq	%edi, %rdi	/* widen k so it can be used as a 64-bit index */
+	movq	$slots, %r10
+	movq	%rsi, %fs:(%r10, %rdi, 0x8)	/* tlshdr.slots[k] = v */
+	ret
+
+/* const _tlsget : (k : key -> void#); k is read from %edi, the slot value is returned in %rax */
+.globl thread$_tlsget
+.globl _thread$_tlsget
+thread$_tlsget:
+_thread$_tlsget:
+	cmpl	%fs:len, %edi	/* unsigned compare of k against tlshdr.len */
+	jnb	oob		/* k >= len: report the bad key */
+
+	movslq	%edi, %rdi	/* widen k so it can be used as a 64-bit index */
+	movq	$slots, %r10
+	movq	%fs:(%r10, %rdi, 0x8), %rax	/* return tlshdr.slots[k] */
+	ret
+
+oob:	/* shared failure path of _tlsset/_tlsget; %edi still holds the offending key */
+	call	thread$tlsoob	/* NOTE(review): presumably never returns (the osx tlsoob ends in std.suicide()); a return would fall through into tlslen */
+
+/* const tlslen : (-> key); returns the 32-bit tlshdr.len field read through %fs */
+.globl thread$tlslen
+.globl _thread$tlslen
+thread$tlslen:
+_thread$tlslen:
+	movl	%fs:len, %eax
+	ret
--- /dev/null
+++ b/lib/thread/tls-impl+osx-x64.s
@@ -1,0 +1,64 @@
+.set tid,	0x00	/* offset of tlshdr.tid (types+osx.myr) */
+.set len,	0x08	/* offset of tlshdr.len */
+.set self,	0x20	/* offset of tlshdr.self */
+.set slots,	0x28	/* offset of tlshdr.slots */
+
+/* const tid : (-> tid); returns the 64-bit tid field read through %gs */
+.globl thread$tid
+.globl _thread$tid
+thread$tid:
+_thread$tid:
+	movq	%gs:tid, %rax
+	ret
+
+/* const _tlsset : (k : key, v : void# -> void); k is read from %rdi, v from %rsi */
+.globl thread$_tlsset
+.globl _thread$_tlsset
+thread$_tlsset:
+_thread$_tlsset:
+	cmpq	%gs:len, %rdi	/* unsigned compare of k against tlshdr.len */
+	jnb	oob		/* k >= len: report the bad key */
+
+	movq	$slots, %r10
+	movq	%rsi, %gs:(%r10, %rdi, 0x8)	/* tlshdr.slots[k] = v */
+	ret
+
+/* const _tlsget : (k : key -> void#); k is read from %rdi, the slot value is returned in %rax */
+.globl thread$_tlsget
+.globl _thread$_tlsget
+thread$_tlsget:
+_thread$_tlsget:
+	cmpq	%gs:len, %rdi	/* unsigned compare of k against tlshdr.len */
+	jnb	oob		/* k >= len: report the bad key */
+
+	movq	$slots, %r10
+	movq	%gs:(%r10, %rdi, 0x8), %rax	/* return tlshdr.slots[k] */
+	ret
+
+oob:	/* shared failure path of _tlsset/_tlsget; %rdi still holds the offending key */
+	call	_thread$tlsoob	/* tlsoob (tls+osx.myr) ends in std.suicide(); a return would fall through into tlslen */
+
+/* const tlslen : (-> key); returns the 64-bit tlshdr.len field read through %gs */
+.globl thread$tlslen
+.globl _thread$tlslen
+thread$tlslen:
+_thread$tlslen:
+	movq	%gs:len, %rax
+	ret
+
+/* const _setgsbase : (h : tlshdr# -> int64); h is passed in %rdi, the raw syscall result comes back in %rax */
+.globl thread$_setgsbase
+.globl _thread$_setgsbase
+thread$_setgsbase:
+_thread$_setgsbase:
+	movq	$0x3000003, %rax /* undocumented syscall; sets %gs to %rdi */
+	syscall		/* setgsbase in tls+osx.myr treats a result of 0xf as success */
+	ret
+
+/* const getgsbase : (-> tlshdr#); returns the pointer stored in tlshdr.self, so it is only as current as that field */
+.globl thread$getgsbase
+.globl _thread$getgsbase
+thread$getgsbase:
+_thread$getgsbase:
+	movq	%gs:self, %rax
+	ret
--- /dev/null
+++ b/lib/thread/types+fsbase.myr
@@ -1,0 +1,19 @@
+use sys
+
+pkg thread =
+	type tid        = sys.pid /* 32 bits on all of the fsbase platforms */
+	type tlskey(@a) = uint32
+
+	/*
+	XXX: Be sure to update tls-impl+fsbase-x64.s and
+	rt/start-{freebsd,linux,netbsd,openbsd}.s if any changes are made to
+	the size of this struct and/or the offsets of any of its members.
+	 */
+	pkglocal type tlshdr = struct
+		tid   : tid	/* offset 0x00 in tls-impl+fsbase-x64.s */
+		len   : tlskey(void)	/* offset 0x04; the bound that _tlsset/_tlsget check keys against */
+		base  : byte#	/* NOTE(review): presumably the thread's stack base; confirm against spawn */
+		stksz : sys.size	/* NOTE(review): presumably the thread's stack size; confirm against spawn */
+		slots : void#[...]	/* offset 0x18; the header before it is 24 bytes per rt/start-*.s */
+	;;
+;;
--- /dev/null
+++ b/lib/thread/types+osx.myr
@@ -1,0 +1,20 @@
+use sys
+
+pkg thread =
+	type tid        = sys.pid /* 64 bits */
+	type tlskey(@a) = uint64
+
+	/*
+	XXX: Be sure to update tls-impl+osx-x64.s and rt/start-osx.s if any changes
+	are made to the size of this struct and/or the offsets of any of its
+	members.
+	 */
+	pkglocal type tlshdr = struct
+		tid   : tid	/* offset 0x00 in tls-impl+osx-x64.s */
+		len   : tlskey(void)	/* offset 0x08; the bound that _tlsset/_tlsget check keys against */
+		base  : byte#	/* NOTE(review): presumably the thread's stack base; confirm against spawn */
+		stksz : sys.size	/* NOTE(review): presumably the thread's stack size; confirm against spawn */
+		self  : tlshdr#	/* offset 0x20; address of this header, read back by getgsbase */
+		slots : void#[...]	/* offset 0x28; the header before it is 40 bytes per rt/start-osx.s */
+	;;
+;;
--- a/mk/bootstrap/bootstrap+Darwin-x86_64.sh
+++ b/mk/bootstrap/bootstrap+Darwin-x86_64.sh
@@ -7,6 +7,7 @@
 	as -g -o mbld/cpufeatures.o mbld/cpufeatures+posixy-x64.s
 	as -g -o lib/thread/start.o lib/thread/start+osx-x64.s
 	as -g -o lib/thread/atomic-impl.o lib/thread/atomic-impl+x64.s
+	as -g -o lib/thread/tls-impl.o lib/thread/tls-impl+osx-x64.s
 	as -g -o lib/std/getbp.o lib/std/getbp+posixy-x64.s
 	$pwd/6/6m -I lib/sys lib/std/option.myr
 	$pwd/6/6m -I lib/sys lib/std/traits.myr
@@ -120,16 +121,18 @@
 	$pwd/6/6m -I lib/std -I lib/sys lib/bio/puti.myr
 	ar -rcs lib/bio/libbio.a lib/bio/puti.o lib/bio/geti.o lib/bio/fd.o lib/bio/mem.o lib/bio/bio.o lib/bio/types.o lib/bio/iter.o
 	$pwd/muse/muse -o lib/bio/libbio.use -p bio lib/bio/puti.use lib/bio/geti.use lib/bio/fd.use lib/bio/mem.use lib/bio/bio.use lib/bio/types.use lib/bio/iter.use
-	$pwd/6/6m -I lib/sys -I lib/std lib/thread/spawn+osx.myr
-	$pwd/6/6m -I lib/sys -I lib/std lib/thread/ncpu.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/common.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/atomic.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/types+osx.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/tls+osx.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/spawn+osx.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/ncpu.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/futex+osx.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/sem.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/mutex.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/hookstd.myr
-	ar -rcs lib/thread/libthread.a lib/thread/mutex.o lib/thread/atomic.o lib/thread/atomic-impl.o lib/thread/hookstd.o lib/thread/sem.o lib/thread/common.o lib/thread/ncpu.o lib/thread/start.o lib/thread/futex.o lib/thread/spawn.o
-	$pwd/muse/muse -o lib/thread/libthread.use -p thread lib/thread/mutex.use lib/thread/atomic.use lib/thread/hookstd.use lib/thread/sem.use lib/thread/common.use lib/thread/ncpu.use lib/thread/futex.use lib/thread/spawn.use
+	ar -rcs lib/thread/libthread.a lib/thread/mutex.o lib/thread/atomic.o lib/thread/atomic-impl.o lib/thread/types.o lib/thread/tls.o lib/thread/tls-impl.o lib/thread/hookstd.o lib/thread/sem.o lib/thread/common.o lib/thread/ncpu.o lib/thread/start.o lib/thread/futex.o lib/thread/spawn.o
+	$pwd/muse/muse -o lib/thread/libthread.use -p thread lib/thread/mutex.use lib/thread/atomic.use lib/thread/types.use lib/thread/tls.use lib/thread/hookstd.use lib/thread/sem.use lib/thread/common.use lib/thread/ncpu.use lib/thread/futex.use lib/thread/spawn.use
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/opts.myr
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/syssel.myr
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/libs.myr
--- a/mk/bootstrap/bootstrap+FreeBSD-amd64.sh
+++ b/mk/bootstrap/bootstrap+FreeBSD-amd64.sh
@@ -7,6 +7,7 @@
 	as -g -o mbld/cpufeatures.o mbld/cpufeatures+posixy-x64.s
 	as -g -o lib/thread/exit.o lib/thread/exit+freebsd-x64.s
 	as -g -o lib/thread/atomic-impl.o lib/thread/atomic-impl+x64.s
+	as -g -o lib/thread/tls-impl.o lib/thread/tls-impl+fsbase-x64.s
 	as -g -o lib/std/getbp.o lib/std/getbp+posixy-x64.s
 	$pwd/6/6m -I lib/sys lib/std/option.myr
 	$pwd/6/6m -I lib/sys lib/std/traits.myr
@@ -120,16 +121,19 @@
 	$pwd/6/6m -I lib/std -I lib/sys lib/bio/puti.myr
 	ar -rcs lib/bio/libbio.a lib/bio/puti.o lib/bio/geti.o lib/bio/fd.o lib/bio/mem.o lib/bio/bio.o lib/bio/types.o lib/bio/iter.o
 	$pwd/muse/muse -o lib/bio/libbio.use -p bio lib/bio/puti.use lib/bio/geti.use lib/bio/fd.use lib/bio/mem.use lib/bio/bio.use lib/bio/types.use lib/bio/iter.use
-	$pwd/6/6m -I lib/sys -I lib/std lib/thread/spawn+freebsd.myr
-	$pwd/6/6m -I lib/sys -I lib/std lib/thread/ncpu+freebsd.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/common.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/atomic.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/types+fsbase.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/fsbase+freebsd.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/tls+fsbase.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/spawn+freebsd.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/ncpu+freebsd.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/futex+freebsd.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/sem.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/mutex.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/hookstd.myr
-	ar -rcs lib/thread/libthread.a lib/thread/mutex.o lib/thread/atomic.o lib/thread/atomic-impl.o lib/thread/hookstd.o lib/thread/sem.o lib/thread/common.o lib/thread/ncpu.o lib/thread/exit.o lib/thread/futex.o lib/thread/spawn.o
-	$pwd/muse/muse -o lib/thread/libthread.use -p thread lib/thread/mutex.use lib/thread/atomic.use lib/thread/hookstd.use lib/thread/sem.use lib/thread/common.use lib/thread/ncpu.use lib/thread/futex.use lib/thread/spawn.use
+	ar -rcs lib/thread/libthread.a lib/thread/mutex.o lib/thread/atomic.o lib/thread/atomic-impl.o lib/thread/types.o lib/thread/fsbase.o lib/thread/tls.o lib/thread/tls-impl.o lib/thread/hookstd.o lib/thread/sem.o lib/thread/common.o lib/thread/ncpu.o lib/thread/exit.o lib/thread/futex.o lib/thread/spawn.o
+	$pwd/muse/muse -o lib/thread/libthread.use -p thread lib/thread/mutex.use lib/thread/atomic.use lib/thread/types.use lib/thread/fsbase.use lib/thread/tls.use lib/thread/hookstd.use lib/thread/sem.use lib/thread/common.use lib/thread/ncpu.use lib/thread/futex.use lib/thread/spawn.use
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/opts.myr
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/syssel.myr
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/libs.myr
--- a/mk/bootstrap/bootstrap+Linux-x86_64.sh
+++ b/mk/bootstrap/bootstrap+Linux-x86_64.sh
@@ -7,6 +7,7 @@
 	as -g -o mbld/cpufeatures.o mbld/cpufeatures+posixy-x64.s
 	as -g -o lib/thread/exit.o lib/thread/exit+linux-x64.s
 	as -g -o lib/thread/atomic-impl.o lib/thread/atomic-impl+x64.s
+	as -g -o lib/thread/tls-impl.o lib/thread/tls-impl+fsbase-x64.s
 	as -g -o lib/std/getbp.o lib/std/getbp+posixy-x64.s
 	$pwd/6/6m -I lib/sys lib/std/option.myr
 	$pwd/6/6m -I lib/sys lib/std/traits.myr
@@ -120,16 +121,19 @@
 	$pwd/6/6m -I lib/std -I lib/sys lib/bio/puti.myr
 	ar -rcs lib/bio/libbio.a lib/bio/puti.o lib/bio/geti.o lib/bio/fd.o lib/bio/mem.o lib/bio/bio.o lib/bio/types.o lib/bio/iter.o
 	$pwd/muse/muse -o lib/bio/libbio.use -p bio lib/bio/puti.use lib/bio/geti.use lib/bio/fd.use lib/bio/mem.use lib/bio/bio.use lib/bio/types.use lib/bio/iter.use
-	$pwd/6/6m -I lib/sys -I lib/std lib/thread/spawn+linux.myr
-	$pwd/6/6m -I lib/sys -I lib/std lib/thread/ncpu+linux.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/common.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/atomic.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/types+fsbase.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/fsbase+linux.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/tls+fsbase.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/spawn+linux.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/ncpu+linux.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/futex+linux.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/sem.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/mutex.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/hookstd.myr
-	ar -rcs lib/thread/libthread.a lib/thread/mutex.o lib/thread/atomic.o lib/thread/atomic-impl.o lib/thread/hookstd.o lib/thread/sem.o lib/thread/common.o lib/thread/ncpu.o lib/thread/exit.o lib/thread/futex.o lib/thread/spawn.o
-	$pwd/muse/muse -o lib/thread/libthread.use -p thread lib/thread/mutex.use lib/thread/atomic.use lib/thread/hookstd.use lib/thread/sem.use lib/thread/common.use lib/thread/ncpu.use lib/thread/futex.use lib/thread/spawn.use
+	ar -rcs lib/thread/libthread.a lib/thread/mutex.o lib/thread/atomic.o lib/thread/atomic-impl.o lib/thread/types.o lib/thread/fsbase.o lib/thread/tls.o lib/thread/tls-impl.o lib/thread/hookstd.o lib/thread/sem.o lib/thread/common.o lib/thread/ncpu.o lib/thread/exit.o lib/thread/futex.o lib/thread/spawn.o
+	$pwd/muse/muse -o lib/thread/libthread.use -p thread lib/thread/mutex.use lib/thread/atomic.use lib/thread/types.use lib/thread/fsbase.use lib/thread/tls.use lib/thread/hookstd.use lib/thread/sem.use lib/thread/common.use lib/thread/ncpu.use lib/thread/futex.use lib/thread/spawn.use
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/opts.myr
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/syssel.myr
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/libs.myr
--- a/mk/bootstrap/bootstrap+NetBSD-amd64.sh
+++ b/mk/bootstrap/bootstrap+NetBSD-amd64.sh
@@ -6,6 +6,7 @@
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/config.myr
 	as -g -o mbld/cpufeatures.o mbld/cpufeatures+posixy-x64.s
 	as -g -o lib/thread/atomic-impl.o lib/thread/atomic-impl+x64.s
+	as -g -o lib/thread/tls-impl.o lib/thread/tls-impl+fsbase-x64.s
 	as -g -o lib/std/getbp.o lib/std/getbp+posixy-x64.s
 	$pwd/6/6m -I lib/sys lib/std/option.myr
 	$pwd/6/6m -I lib/sys lib/std/traits.myr
@@ -119,15 +120,18 @@
 	$pwd/6/6m -I lib/std -I lib/sys lib/bio/puti.myr
 	ar -rcs lib/bio/libbio.a lib/bio/puti.o lib/bio/geti.o lib/bio/fd.o lib/bio/mem.o lib/bio/bio.o lib/bio/types.o lib/bio/iter.o
 	$pwd/muse/muse -o lib/bio/libbio.use -p bio lib/bio/puti.use lib/bio/geti.use lib/bio/fd.use lib/bio/mem.use lib/bio/bio.use lib/bio/types.use lib/bio/iter.use
-	$pwd/6/6m -I lib/sys -I lib/std lib/thread/spawn+netbsd.myr
-	$pwd/6/6m -I lib/sys -I lib/std lib/thread/ncpu.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/common.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/atomic.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/types+fsbase.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/fsbase+netbsd.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/tls+fsbase.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/spawn+netbsd.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/ncpu.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/sem.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/mutex.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/hookstd.myr
-	ar -rcs lib/thread/libthread.a lib/thread/mutex.o lib/thread/atomic.o lib/thread/atomic-impl.o lib/thread/hookstd.o lib/thread/sem.o lib/thread/common.o lib/thread/ncpu.o lib/thread/spawn.o
-	$pwd/muse/muse -o lib/thread/libthread.use -p thread lib/thread/mutex.use lib/thread/atomic.use lib/thread/hookstd.use lib/thread/sem.use lib/thread/common.use lib/thread/ncpu.use lib/thread/spawn.use
+	ar -rcs lib/thread/libthread.a lib/thread/mutex.o lib/thread/atomic.o lib/thread/atomic-impl.o lib/thread/types.o lib/thread/fsbase.o lib/thread/tls.o lib/thread/tls-impl.o lib/thread/hookstd.o lib/thread/sem.o lib/thread/common.o lib/thread/ncpu.o lib/thread/spawn.o
+	$pwd/muse/muse -o lib/thread/libthread.use -p thread lib/thread/mutex.use lib/thread/atomic.use lib/thread/types.use lib/thread/fsbase.use lib/thread/tls.use lib/thread/hookstd.use lib/thread/sem.use lib/thread/common.use lib/thread/ncpu.use lib/thread/spawn.use
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/opts.myr
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/syssel.myr
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/libs.myr
--- a/mk/bootstrap/bootstrap+OpenBSD-amd64.sh
+++ b/mk/bootstrap/bootstrap+OpenBSD-amd64.sh
@@ -7,6 +7,7 @@
 	as -g -o mbld/cpufeatures.o mbld/cpufeatures+posixy-x64.s
 	as -g -o lib/thread/exit.o lib/thread/exit+openbsd-x64.s
 	as -g -o lib/thread/atomic-impl.o lib/thread/atomic-impl+x64.s
+	as -g -o lib/thread/tls-impl.o lib/thread/tls-impl+fsbase-x64.s
 	as -g -o lib/std/getbp.o lib/std/getbp+posixy-x64.s
 	$pwd/6/6m -I lib/sys lib/std/option.myr
 	$pwd/6/6m -I lib/sys lib/std/traits.myr
@@ -120,16 +121,19 @@
 	$pwd/6/6m -I lib/std -I lib/sys lib/bio/puti.myr
 	ar -rcs lib/bio/libbio.a lib/bio/puti.o lib/bio/geti.o lib/bio/fd.o lib/bio/mem.o lib/bio/bio.o lib/bio/types.o lib/bio/iter.o
 	$pwd/muse/muse -o lib/bio/libbio.use -p bio lib/bio/puti.use lib/bio/geti.use lib/bio/fd.use lib/bio/mem.use lib/bio/bio.use lib/bio/types.use lib/bio/iter.use
-	$pwd/6/6m -I lib/sys -I lib/std lib/thread/spawn+openbsd.myr
-	$pwd/6/6m -I lib/sys -I lib/std lib/thread/ncpu+openbsd.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/common.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/atomic.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/types+fsbase.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/fsbase+openbsd.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/tls+fsbase.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/spawn+openbsd.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/ncpu+openbsd.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/futex+openbsd:6.2.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/sem.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/mutex.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/hookstd.myr
-	ar -rcs lib/thread/libthread.a lib/thread/mutex.o lib/thread/atomic.o lib/thread/atomic-impl.o lib/thread/hookstd.o lib/thread/sem.o lib/thread/common.o lib/thread/ncpu.o lib/thread/exit.o lib/thread/futex.o lib/thread/spawn.o
-	$pwd/muse/muse -o lib/thread/libthread.use -p thread lib/thread/mutex.use lib/thread/atomic.use lib/thread/hookstd.use lib/thread/sem.use lib/thread/common.use lib/thread/ncpu.use lib/thread/futex.use lib/thread/spawn.use
+	ar -rcs lib/thread/libthread.a lib/thread/mutex.o lib/thread/atomic.o lib/thread/atomic-impl.o lib/thread/types.o lib/thread/fsbase.o lib/thread/tls.o lib/thread/tls-impl.o lib/thread/hookstd.o lib/thread/sem.o lib/thread/common.o lib/thread/ncpu.o lib/thread/exit.o lib/thread/futex.o lib/thread/spawn.o
+	$pwd/muse/muse -o lib/thread/libthread.use -p thread lib/thread/mutex.use lib/thread/atomic.use lib/thread/types.use lib/thread/fsbase.use lib/thread/tls.use lib/thread/hookstd.use lib/thread/sem.use lib/thread/common.use lib/thread/ncpu.use lib/thread/futex.use lib/thread/spawn.use
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/opts.myr
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/syssel.myr
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/libs.myr
--- a/parse/type.c
+++ b/parse/type.c
@@ -236,8 +236,6 @@
 		bindtype(t->env, param[i]);
 	if (!base->env)
 		base->env = t->env;
-	else 
-		assert(base->env->super == t->env || base->narg > 0);
 	return t;
 }
 
--- a/rt/start-freebsd.s
+++ b/rt/start-freebsd.s
@@ -4,6 +4,10 @@
 sys$__cenvp:
     .quad 0
 
+.globl thread$__tls
+thread$__tls:
+    .fill 88 /* sizeof(tlshdr) + (8 * sizeof(void#)) = 24 + 64 */
+
 .text
 /*
  * The entry point for the whole program.
@@ -11,6 +15,7 @@
  *  - Sets up all argc entries as slices
  *  - Converts argc/argv to a slice
  *  - Stashes a raw envp copy in __cenvp (for syscalls to use)
+ *  - Sets up thread local storage for the main thread
  *  - Calls main()
  */
 .globl _start
@@ -34,6 +39,16 @@
 	pushq	%rax
 	pushq	%rcx
 	call	cvt
+
+	/* set up the initial tls region for the main thread */
+	subq	$0x10,%rsp		/* scratch space for the sysarch argument */
+	movq	$165,%rax		/* sysarch */
+	movq	$129,%rdi		/* Archamd64setfs */
+	leaq	thread$__tls(%rip),%rsi
+	movq	%rsi,(%rsp)		/* sysarch takes a pointer to the new base, so spill the address */
+	movq	%rsp,%rsi
+	syscall
+	addq	$0x10,%rsp
 
 	xorq %rbp,%rbp
 	/* call pre-main initializers */
--- a/rt/start-linux.s
+++ b/rt/start-linux.s
@@ -4,6 +4,10 @@
 sys$__cenvp:
     .quad 0
 
+.globl thread$__tls
+thread$__tls:
+    .fill 88 /* sizeof(tlshdr) + (8 * sizeof(void#)) = 24 + 64 */
+
 .text
 /*
  * The entry point for the whole program.
@@ -11,6 +15,7 @@
  *  - Sets up all argc entries as slices
  *  - Converts argc/argv to a slice
  *  - Stashes a raw envp copy in __cenvp (for syscalls to use)
+ *  - Sets up thread local storage for the main thread
  *  - Calls main()
  */
 .globl _start
@@ -35,6 +40,12 @@
 	pushq	%rax
 	pushq	%rcx
 	call	cvt
+
+	/* set up the initial tls region for the main thread */
+	movq	$158,%rax		/* arch_prctl */
+	movq	$0x1002,%rdi		/* Archsetfs */
+	leaq	thread$__tls(%rip),%rsi
+	syscall
 
 	xorq %rbp,%rbp
 	/* call pre-main initializers */
--- a/rt/start-netbsd.s
+++ b/rt/start-netbsd.s
@@ -12,6 +12,10 @@
 sys$__cenvp:
     .quad 0
 
+.globl thread$__tls
+thread$__tls:
+    .fill 88 /* sizeof(tlshdr) + (8 * sizeof(void#)) = 24 + 64 */
+
 .text
 /*
  * The entry point for the whole program.
@@ -19,6 +23,7 @@
  *  - Sets up all argc entries as slices
  *  - Converts argc/argv to a slice
  *  - Stashes a raw envp copy in __cenvp (for syscalls to use)
+ *  - Sets up thread local storage for the main thread
  *  - Calls main()
  */
 .globl _start
@@ -43,6 +48,16 @@
 	pushq	%rax
 	pushq	%rcx
 	call	cvt
+
+	/* set up the initial tls region for the main thread */
+	subq	$0x10,%rsp		/* scratch space for the sysarch argument */
+	movq	$165,%rax		/* sysarch */
+	movq	$15,%rdi		/* X8664setfsbase; NOTE(review): NetBSD's x86 sysarch.h appears to list 15 as X86_GET_FSBASE and 17 as X86_SET_FSBASE - confirm */
+	leaq	thread$__tls(%rip),%rsi
+	movq	%rsi,(%rsp)		/* the new base is passed by pointer, so spill the address */
+	movq	%rsp,%rsi
+	syscall
+	addq	$0x10,%rsp
 
 	xorq %rbp,%rbp
 	/* call pre-main initializers */
--- a/rt/start-openbsd.s
+++ b/rt/start-openbsd.s
@@ -13,6 +13,10 @@
 sys$__cenvp:
     .quad 0
 
+.globl thread$__tls
+thread$__tls:
+    .fill 88 /* sizeof(tlshdr) + (8 * sizeof(void#)) = 24 + 64 */
+
 .text
 /*
  * The entry point for the whole program.
@@ -20,6 +24,7 @@
  *  - Sets up all argc entries as slices
  *  - Converts argc/argv to a slice
  *  - Stashes a raw envp copy in __cenvp (for syscalls to use)
+ *  - Sets up thread local storage for the main thread
  *  - Calls main()
  */
 .globl _start
@@ -44,6 +49,11 @@
 	pushq	%rax
 	pushq	%rcx
 	call	cvt
+
+	/* set up the initial tls region for the main thread */
+	movq	$329,%rax		/* Sys__set_tcb */
+	leaq	thread$__tls(%rip),%rdi
+	syscall
 
 	xorq %rbp,%rbp
 	/*
--- a/rt/start-osx.s
+++ b/rt/start-osx.s
@@ -4,6 +4,10 @@
 _sys$__cenvp:
     .quad 0
 
+.globl thread$__tls
+thread$__tls:
+    .fill 104 /* sizeof(tlshdr) + (8 * sizeof(void#)) = 40 + 64 */
+
 .text
 /*
  * The entry point for the whole program.
@@ -11,6 +15,7 @@
  *  - Sets up all argc entries as slices
  *  - Converts argc/argv to a slice
  *  - Stashes a raw envp copy in __cenvp (for syscalls to use)
+ *  - Sets up thread local storage for the main thread
  *  - Calls main()
  */
 .globl start
@@ -35,6 +40,12 @@
 	pushq	%rax
 	pushq	%rcx
 	call	cvt
+
+	/* set up the initial tls region for the main thread */
+	movq	$0x3000003,%rax		/* undocumented setgsbase syscall */
+	leaq	thread$__tls(%rip),%rdi
+	movq	%rdi,0x20(%rdi)		/* also store a copy in __tls.self */
+	syscall
 
 	xorq %rbp,%rbp
 	call	___init__
--- a/support/syscall-gen/types+freebsd-x64.frag
+++ b/support/syscall-gen/types+freebsd-x64.frag
@@ -32,6 +32,7 @@
 type cpulevel	= int
 type cpusetid	= int
 type idtype	= int
+type sysarchop	= int64	/* 64 bits: sysarch passes op to the variadic syscall() uncast, and sys+freebsd-x64.myr declares int64 */
 
 type acltype	= int
 type acltag	= uint32
@@ -795,6 +796,13 @@
 const Sigthr	: signo = 32	/* reserved by thread library. */
 const Siglwp	: signo = Sigthr
 const Siglibrt	: signo = 33	/* reserved by real-time library. */
+
+/* sysarch ops; values must match the generated lib/sys/sys+freebsd-x64.myr */
+const Archamd64getfs   : sysarchop = 128
+const Archamd64setfs   : sysarchop = 129
+const Archamd64getgs   : sysarchop = 130
+const Archamd64setgs   : sysarchop = 131
+const Archamd64getxfpu : sysarchop = 132
 
 extern const syscall : (sc:scno, args:... -> int64)
 extern var __cenvp : byte##
--- a/support/syscall-gen/types+linux-x64.frag
+++ b/support/syscall-gen/types+linux-x64.frag
@@ -38,6 +38,7 @@
 type mfdflags	= uint32
 type aiocontext	= uint64
 type msg	= void#
+type arch_prctlop	= uint64
 
 
 type clock = union
@@ -583,6 +584,12 @@
 
 /* return value for a failed mapping */
 const Mapbad	: byte# = (-1 : byte#)
+
+/* arch_prctl ops */
+const Archsetgs : arch_prctlop = 0x1001
+const Archsetfs : arch_prctlop = 0x1002
+const Archgetfs : arch_prctlop = 0x1003
+const Archgetgs : arch_prctlop = 0x1004
 
 /* signal flags */
 const Sanocldstop	: sigflags = 0x00000001