shithub: gefs

Download patch

ref: 3f842a00016a70d021ddf27a901d564a8e97ab61
author: Ori Bernstein <[email protected]>
date: Mon Dec 28 20:07:18 EST 2020

initial commit

--- /dev/null
+++ b/blk.c
@@ -1,0 +1,52 @@
+#include <u.h>
+#include <libc.h>
+
+#include "dat.h"
+#include "fns.h"
+
+Blk*
+newblk(int t)
+{
+	Blk *b;
+
+	b = emalloc(sizeof(Blk));
+	b->flag |= Bdirty;
+	b->type = t;
+	return b;	
+}
+
+Blk*
+getblk(uvlong bp, uvlong bh)
+{
+	Blk *b;
+
+	b = (Blk*)bp;
+	if(blkhash(b) != bh){
+		werrstr("corrupt block %llx\n", bp);
+		fprint(2, "corrupt block %llx\n", bp);
+//		abort();
+	}
+	return b;
+}
+
+void
+freeblk(Blk *b)
+{
+	b->refs--;
+	if(b->refs == 0)
+		free(b);
+}
+
+uvlong
+blkhash(Blk *b)
+{
+	return siphash(b->data, Blksz);
+}
+
+int
+putblk(Blk *b)
+{
+	USED(b);
+	/* TODO: unref. */
+	return 0;
+}
--- /dev/null
+++ b/check.c
@@ -1,0 +1,142 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+#include "dat.h"
+#include "fns.h"
+
+char	spc[128];
+int	debug;
+
+static int
+invalidblk(Blk *b, int h, Kvp *lo, Kvp *hi)
+{
+	Kvp x, y;
+	int i, r;
+	Blk *c;
+	int fail;
+
+	fail = 0;
+	if(b->type == Leaf){
+		if(h != fs->height){
+			fprint(2, "unbalanced leaf\n");
+			fail++;
+		}
+		if(h != 1 && b->nent < 2){
+			fprint(2, "underfilled leaf\n");
+			fail++;
+		}
+	}
+	if(b->type == Pivot && b->nent < 2){
+		fprint(2, "underfilled  pivot\n");
+		fail++;
+	}
+	getval(b, 0, &x);
+	if(lo && keycmp(lo, &x) != 0)
+		fprint(2, "out of range keys %P != %P\n", lo, &x);
+	for(i = 1; i < b->nent; i++){
+		getval(b, i, &y);
+		if(hi && keycmp(&y, hi) >= 0){
+			fprint(2, "out of range keys %P >= %P\n", &y, hi);
+			fail++;
+		}
+		if(b->type == Pivot){
+			c = getblk(x.bp, x.bh);
+			if(invalidblk(c, h + 1, &x, &y))
+				fail++;
+		}
+		r = keycmp(&x, &y);
+		switch(r){
+		case -1:
+			break;
+		case 0:
+			fprint(2, "duplicate keys %P, %P\n", &x, &y);
+			fail++;
+			break;
+		case 1:
+			fprint(2, "misordered keys %P, %P\n", &x, &y);
+			fail++;
+			break;
+		}
+		x = y;
+	}
+	return fail;
+}
+	
+
+/* TODO: this will grow into fsck. */
+int
+checkfs(void)
+{
+	return invalidblk(fs->root, 1, nil, nil) == 0;
+}
+
+static void
+rshowblk(Blk *b, int indent)
+{
+	Blk *c;
+	int i;
+	Kvp kv;
+	Msg m;
+
+	if(indent > sizeof(spc)/4)
+		indent = sizeof(spc)/4;
+	if(b == nil){
+		print("NIL\n");
+		return;
+	}
+	if(b->type == Pivot){
+		for(i = 0; i < b->nmsg; i++){
+			getmsg(b, i, &m);
+			print("%.*s|%M\n", 4*indent, spc, &m);
+		}
+	}
+	for(i = 0; i < b->nent; i++){
+		getval(b, i, &kv);
+		print("%.*s|%P\n", 4*indent, spc, &kv);
+		if(b->type == Pivot){
+			if((c = getblk(kv.bp, kv.bh)) == nil)
+				sysfatal("falied load: %r");
+			rshowblk(c, indent + 1);
+		}
+	}
+}
+
+
+void
+initshow(void)
+{
+	int i;
+
+	memset(spc, ' ', sizeof(spc));
+	for(i = 0; i < sizeof(spc); i += 4)
+		spc[i] = '|';
+}
+
+void
+showblk(Blk *b, char *m)
+{
+	if(m != nil)
+		print("=== %s\n", m);
+	rshowblk(b, 0);
+}
+
+void
+showfs(char *m)
+{
+	if(m != nil)
+		print("=== %s\n", m);
+	rshowblk(fs->root, 0);
+}
+
+
+void
+showpath(Path *p, int np)
+{
+	int i;
+
+	for(i = 0; i < np; i++){
+		print("==> b=%p, n=%p, idx=%d\n", p[i].b, p[i].n, p[i].idx);
+		print("\tclear=(%d, %d):%d\n", p[i].lo, p[i].hi, p[i].sz);
+		print("\tl=%p, r=%p, n=%p, split=%d\n", p[i].l, p[i].r, p[i].n, p[i].split);
+	}
+}
--- /dev/null
+++ b/dat.h
@@ -1,0 +1,139 @@
+typedef struct Blk	Blk;
+typedef struct Gefs	Gefs;
+typedef struct Msg	Msg;
+typedef struct Key	Key;
+typedef struct Val	Val;
+typedef struct Kvp	Kvp;
+typedef struct Path	Path;
+
+enum {
+	// buffer for the whole block
+	Blksz	= 256,
+	// will store what type of block
+	Hdrsz	= 16,
+	Blkspc	= Blksz - Hdrsz,
+	// space for message buffer
+	Bufspc  = Blkspc / 2,
+	// space for pivot keys and offsets
+	Pivspc	= Blkspc - Bufspc,
+	Leafspc = Blkspc,
+	Keymax	= 16,
+	Inlmax	= 64,
+	Ptrsz	= 16,
+	Kvmax	= Keymax + Inlmax,	/* Key and value */
+	Kpmax	= Keymax + Ptrsz,	/* Key and pointer */
+	Msgmax  = 1 + (Kvmax > Kpmax ? Kvmax : Kpmax)
+};
+
+enum {
+	Bdirty = 1 << 0,
+};
+
+#define Efs	"i will not buy this fs, it is scratched"
+#define Eexist	"does not exist"
+#define Ebotch	"protocol botch"
+#define Emode	"unknown mode"
+#define Efull	"file system full"
+#define Eauth	"authentication failed"
+#define Elength	"name too long"
+#define Eperm	"permission denied"
+
+/*
+ * The type of block. Pivot nodes are internal to the tree,
+ * while leaves inhabit the edges. Pivots are split in half,
+ * containing a buffer for the data, and keys to direct the
+ * searches. Their buffers contain messages en-route to the
+ * leaves.
+ *
+ * Leaves contain the key, and some chunk of data as the
+ * value.
+ */
+enum {
+	Pivot,
+	Leaf,
+};
+
+enum {
+	Vinl,	/* Inline value */
+	Vref,	/* Block pointer */
+};
+
+enum {
+	Ocreate,
+	Odelete,
+	Owrite,
+	Owstat,
+};
+
+/*
+ * Overall state of the file sytem.
+ * Shadows the superblock contents.
+ */
+struct Gefs {
+	int	blksz;
+	int	bufsz;
+	int	pivsz;
+	int	hdrsz;
+
+	int	height;
+	Blk	*root;
+	
+};
+
+struct Key{
+	char	*k;
+	int	nk;
+};
+
+struct Val {
+	int type;
+	union {
+		/* block pointer */
+		struct {
+			uvlong	bp;
+			uvlong	bh;
+		};
+		/* inline values */
+		struct {
+			short	nv;
+			char	*v;
+		};
+	};
+};
+
+struct Kvp {
+	Key;
+	Val;
+};
+
+struct Msg {
+	char	op;
+	Kvp;
+};
+
+struct Path {
+	/* Flowing down for flush */
+	Blk	*b;	/* insertion */
+	int	idx;	/* insert at */
+	int	lo;	/* key range */
+	int	hi;	/* key range */
+	int	sz;	/* size of range */
+
+	/* Flowing up from flush */
+	Blk	*l;	/* left of split */
+	Blk	*r;	/* right of split */
+	Blk	*n;	/* shadowed node */
+	int	split;	/* did we split? */
+};
+
+struct Blk {
+	char	type;
+	char	flag;
+	short	nent;
+	short	valsz;
+	short   nmsg;
+	short   bufsz;
+	vlong	off;	/* -1 for unallocated */
+	int	refs;	/* TODO: move out */
+	char	data[Blksz];
+};
--- /dev/null
+++ b/fns.h
@@ -1,0 +1,44 @@
+#pragma varargck type "M"	Msg*
+#pragma varargck type "P"	Kvp*
+#pragma varargck type "K"	Key*
+#pragma varargck type "V"	Val*
+#pragma varargck type "B"	Blk*
+
+extern Gefs	*fs;
+extern int	debug;
+
+void	initfs(void);
+
+Blk*	newblk(int type);
+Blk*	shadow(Blk*, Path*, Path*);
+int	putblk(Blk*);
+Blk*	getblk(uvlong bp, uvlong bh);
+void	freeblk(Blk *b);
+uvlong	blkhash(Blk*);
+uvlong	siphash(void*, usize);
+
+int	fsupsert(Msg*);
+char	*fswalk1(Key*, Kvp*);
+
+void*	emalloc(usize);
+void*	erealloc(void*, usize);
+char*	estrdup(char*);
+
+int	keycmp(Key *, Key *);
+
+/* for dumping */
+void	getval(Blk *, int, Kvp *);
+void	getmsg(Blk *, int, Msg *);
+
+void	initshow(void);
+void	showblk(Blk*, char*);
+void	showpath(Path*, int);
+void	showfs(char*);
+int	checkfs(void);
+
+/* scratch */
+void	setmsg(Blk *, int, Msg *);
+void	bufinsert(Blk *, Msg *);
+void victim(Blk *b, Path *p);
+void blkinsert(Blk *b, Kvp *kv);
+
--- /dev/null
+++ b/hash.c
@@ -1,0 +1,99 @@
+/*
+ Copyright (c) 2013  Marek Majkowski <[email protected]>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+ Original location:
+    https://github.com/majek/csiphash/
+
+ Solution inspired by code from:
+    Samuel Neves (supercop/crypto_auth/siphash24/little)
+    djb (supercop/crypto_auth/siphash24/little2)
+    Jean-Philippe Aumasson (https://131002.net/siphash/siphash24.c)
+*/
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+
+#define _le64toh(x) \
+	GBIT64((char*)&x)
+
+
+#define ROTATE(x, b) (u64int)( ((x) << (b)) | ( (x) >> (64 - (b))) )
+
+#define HALF_ROUND(a,b,c,d,s,t)			\
+	a += b; c += d;				\
+	b = ROTATE(b, s) ^ a;			\
+	d = ROTATE(d, t) ^ c;			\
+	a = ROTATE(a, 32);
+
+#define DOUBLE_ROUND(v0,v1,v2,v3)		\
+	HALF_ROUND(v0,v1,v2,v3,13,16);		\
+	HALF_ROUND(v2,v1,v0,v3,17,21);		\
+	HALF_ROUND(v0,v1,v2,v3,13,16);		\
+	HALF_ROUND(v2,v1,v0,v3,17,21);
+
+
+u64int
+siphash24(const void *src, unsigned long src_sz, const char key[16]) {
+	const u64int *_key = (u64int *)key;
+	u64int k0 = _le64toh(_key[0]);
+	u64int k1 = _le64toh(_key[1]);
+	u64int b = (u64int)src_sz << 56;
+	const u64int *in = (u64int*)src;
+
+	u64int v0 = k0 ^ 0x736f6d6570736575ULL;
+	u64int v1 = k1 ^ 0x646f72616e646f6dULL;
+	u64int v2 = k0 ^ 0x6c7967656e657261ULL;
+	u64int v3 = k1 ^ 0x7465646279746573ULL;
+
+	while (src_sz >= 8) {
+		u64int mi = _le64toh(*in);
+		in += 1; src_sz -= 8;
+		v3 ^= mi;
+		DOUBLE_ROUND(v0,v1,v2,v3);
+		v0 ^= mi;
+	}
+
+	u64int t = 0; u8int *pt = (u8int *)&t; u8int *m = (u8int *)in;
+	switch (src_sz) {
+	case 7: pt[6] = m[6];
+	case 6: pt[5] = m[5];
+	case 5: pt[4] = m[4];
+	case 4: *((u32int*)&pt[0]) = *((u32int*)&m[0]); break;
+	case 3: pt[2] = m[2];
+	case 2: pt[1] = m[1];
+	case 1: pt[0] = m[0];
+	}
+	b |= _le64toh(t);
+
+	v3 ^= b;
+	DOUBLE_ROUND(v0,v1,v2,v3);
+	v0 ^= b; v2 ^= 0xff;
+	DOUBLE_ROUND(v0,v1,v2,v3);
+	DOUBLE_ROUND(v0,v1,v2,v3);
+	return (v0 ^ v1) ^ (v2 ^ v3);
+}
+
+u64int
+siphash(void *src, usize len)
+{
+	char key[16] = "gefsgefsgefsgefs";
+	return siphash24(src, len, key);
+}
--- /dev/null
+++ b/main.c
@@ -1,0 +1,163 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+#include "dat.h"
+#include "fns.h"
+
+Gefs *fs;
+
+static int
+Bconv(Fmt *fmt)
+{
+	Blk *b;
+
+	b = va_arg(fmt->args, Blk*);
+	if(b == nil)
+		return fmtprint(fmt, "Blk(nil)");
+	return fmtprint(fmt, "Blk(%c)", (b->type == Pivot) ? 'P' : 'L');
+}
+
+static int
+Mconv(Fmt *fmt)
+{
+	char *opname[] = {
+	[Ocreate]	"Ocreate",
+	[Odelete]	"Odelete",
+	[Owrite]	"Owrite",
+	[Owstat]	"Owstat",
+	};
+	Msg *m;
+
+	m = va_arg(fmt->args, Msg*);
+	return fmtprint(fmt, "Msg(%s, %.*s,%.*s)", opname[m->op], m->nk, m->k, m->nv, m->v);
+}
+
+static int
+Pconv(Fmt *fmt)
+{
+	Kvp *kv;
+
+	kv = va_arg(fmt->args, Kvp*);
+	if(kv->type == Vinl)
+		return fmtprint(fmt, "Kvp(%.*s,%.*s)", kv->nk, kv->k, kv->nv, kv->v);
+	else
+		return fmtprint(fmt, "Kvp(%.*s,(%llx,%llx))", kv->nk, kv->k, kv->bp, kv->bh);
+}
+
+static int
+Kconv(Fmt *fmt)
+{
+	Key *k;
+
+	k = va_arg(fmt->args, Key*);
+	return fmtprint(fmt, "Key(%.*s)", k->nk, k->k);
+}
+
+static void
+init(void)
+{
+	initshow();
+	quotefmtinstall();
+	fmtinstall('B', Bconv);
+	fmtinstall('M', Mconv);
+	fmtinstall('P', Pconv);
+	fmtinstall('K', Kconv);
+	fs = emalloc(sizeof(Gefs));
+	fs->root = newblk(Leaf);
+	fs->height = 1;
+}
+
+int
+test(char *path)
+{
+	Biobuf *bfd;
+	char *e, *ln, *f[3];
+	int nf;
+	Msg m;
+	Kvp r;
+	Key k;
+
+	if((bfd = Bopen(path, OREAD)) == nil)
+		sysfatal("open %s: %r", path);
+	while((ln = Brdstr(bfd, '\n', 1)) != nil){
+		memset(f, 0, sizeof(f));
+		nf = tokenize(ln, f, nelem(f));
+		if(nf < 1 || strlen(f[0]) != 1)
+			sysfatal("malformed test file");
+		switch(*f[0]){
+		case '#':
+			break;
+		case 'I':
+			if(nf != 3)
+				sysfatal("malformed insert");
+			m.type = Vinl;
+			m.k = f[1];
+			m.v = f[2];
+			m.op = Ocreate;
+			m.nk = strlen(f[1]);
+			m.nv = strlen(f[2]);
+			print("insert (%s, %s)\n", m.k, m.v);
+			if(fsupsert(&m) == -1){
+				print("failed insert (%s, %s): %r\n", f[1], f[2]);
+				return -1;
+			}
+			break;
+		case 'D':
+			if(nf != 2)
+				sysfatal("malformed delete");
+			m.type = Vinl;
+			m.op = Odelete;
+			m.k = f[1];
+			m.v = nil;
+			m.nk = strlen(f[1]);
+			m.nv = 0;
+			print("delete %s\n", f[1]);
+			if(fsupsert(&m) == -1){
+				print("failed delete (%s): %r\n", f[1]);
+				return -1;
+			}
+			break;
+		case 'G':
+			k.k = f[1];
+			k.nk = strlen(f[1]);
+			e = fswalk1(&k, &r);
+			if(e != nil){
+				print("failed lookup on (%s): %s\n", f[1], e);
+				return -1;
+			}
+			break;
+		case 'S':
+			showfs("fs");
+			print("\n\n");
+			break;
+		case 'C':
+			checkfs();
+			break;
+		case 'V':
+			debug++;
+			break;
+		case 'X':
+			exits(f[1]);
+			break;
+		}
+//		if(!checkfs())
+//			abort();
+		free(ln);
+	}
+	return 0;
+}
+
+void
+main(int argc, char **argv)
+{
+	int i;
+
+	ARGBEGIN{
+	}ARGEND;
+
+	init();
+	for(i = 0; i < argc; i++)
+		if(test(argv[0]) == -1)
+			sysfatal("test %s: %r\n", argv[i]);
+	exits(nil);
+}
--- /dev/null
+++ b/mkfile
@@ -1,0 +1,16 @@
+</$objtype/mkfile
+
+TARG=gefs
+OFILES=\
+	blk.$O\
+	check.$O\
+	hash.$O\
+	main.$O\
+	tree.$O\
+	util.$O\
+
+HFILES=\
+	dat.h\
+	fns.h
+
+</sys/src/cmd/mkone
--- /dev/null
+++ b/tree.c
@@ -1,0 +1,771 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+
+#include "dat.h"
+#include "fns.h"
+
+int lookup;
+void
+dprint(char *fmt, ...)
+{
+	va_list ap;
+
+	if(!debug)
+		return;
+	va_start(ap, fmt);
+	vfprint(2, fmt, ap);
+	va_end(ap);
+}
+
+int
+keycmp(Key *a, Key *b)
+{
+	int c, n;
+
+	n = (a->nk < b->nk) ? a->nk : b->nk;
+	if((c = memcmp(a->k, b->k, n)) != 0)
+		return c < 0 ? -1 : 1;
+	if(a->nk < b->nk)
+		return -1;
+	else if(a->nk > b->nk)
+		return 1;
+	else
+		return 0;
+}
+
+int
+msgsz(Msg *m)
+{
+	return 1 + 2 + m->nk + 2 + m->nv;
+}
+
+int
+valsz(Kvp *kv)
+{
+	if(kv->type == Vref)
+		return 2+kv->nk + Ptrsz;
+	else
+		return 2+kv->nk + 2+kv->nv;
+}
+
+void
+getval(Blk *b, int i, Kvp *kv)
+{
+	int o;
+
+	assert(i >= 0 && i < b->nent);
+	o = GBIT16(b->data + 2*i);
+	if(b->type == Pivot){
+		kv->type = Vref;
+		kv->nk = GBIT16(b->data + o);
+		kv->k = b->data + o + 2;
+		kv->bp = GBIT64(kv->k + kv->nk + 0);
+		kv->bh = GBIT64(kv->k + kv->nk + 8);
+	}else{
+		kv->type = Vinl;
+		kv->nk = GBIT16(b->data + o);
+		kv->k = b->data + o + 2;
+		kv->nv = GBIT16(kv->k + kv->nk);
+		kv->v = kv->k + kv->nk + 2;
+	}
+}
+
+static void
+setval(Blk *b, int i, Kvp *kv, int replace)
+{
+	int o, nk, nv, ksz, vsz, spc;
+	char *p;
+
+	assert(i >= 0 && i <= b->nent);
+	spc = (b->type == Leaf) ? Leafspc : Pivspc;
+	p = b->data + 2*i;
+	nk = 2 + kv->nk;
+	nv = (kv->type == Vref) ? Ptrsz : 2 + kv->nv;
+	if (i < 0)
+		i = 0;
+	if(!replace || b->nent == i){
+		memmove(p + 2, p, 2*(b->nent - i));
+		b->nent++;
+		b->valsz += nk + nv;
+		o = spc - b->valsz;
+	}else{
+		/*
+		 * If we can't put it where it was before,
+		 * we need to allocate new space for the
+		 * key-value data. It'll get compacted when
+		 * we split or merge.
+		 */
+		o = GBIT16(b->data + 2*i);
+		ksz = 2 + GBIT16(b->data + o);
+		if(b->type == Leaf)
+			vsz = 2 + GBIT16(b->data + o + ksz);
+		else
+			vsz = 16;
+		if(ksz + vsz < nk + nv){
+			b->valsz += nk + nv;
+			o = spc - b->valsz;
+		}
+	}
+
+	if(2*b->nent + b->valsz > spc)
+		showblk(b, "setval overflow");
+	assert(2*b->nent + b->valsz <= spc);
+	assert(2*b->nent < o);
+	p = b->data + o;
+	if(b->type == Pivot){
+		assert(kv->type == Vref);
+		PBIT16(b->data + 2*i, o);
+		PBIT16(p +  0, kv->nk);
+		memcpy(p +  2, kv->k, kv->nk);
+		PBIT64(p + kv->nk +  2, kv->bp);
+		PBIT64(p + kv->nk + 10, kv->bh);
+	} else {
+		assert(kv->type == Vinl);
+		PBIT16(b->data + 2*i, o);
+		PBIT16(p +  0, kv->nk);
+		memcpy(p +  2, kv->k, kv->nk);
+		PBIT16(p + kv->nk + 2, kv->nv);
+		memcpy(p + kv->nk + 4, kv->v, kv->nv);
+	}
+}
+
+static int
+delval(Blk *b, int i)
+{
+	char *p;
+
+	assert(i >= 0 && i <= b->nent);
+	b->nent--;
+	p = b->data + 2*i;
+	memmove(p, p + 2, 2*(b->nent - i));
+	return 0;
+}
+
+void
+setmsg(Blk *b, int i, Msg *m)
+{
+	char *p;
+	int o;
+
+	assert(b->type == Pivot);
+	assert(i >= 0 && i <= b->nmsg);
+	b->nmsg++;
+	b->bufsz += msgsz(m);
+	assert(2*b->nent + b->bufsz <= Bufspc);
+
+	p = b->data + Pivspc;
+	o = Pivspc - b->bufsz;
+	memmove(p + 2*(i+1), p+2*i, 2*(b->nmsg - i));
+	PBIT16(p + 2*i, o);
+
+	p = b->data + Bufspc + o;
+	*p = m->op;
+	PBIT16(p + 1, m->nk);
+	memcpy(p + 3, m->k, m->nk);
+	PBIT16(p + 3 + m->nk, m->nv);
+	memcpy(p + 5 + m->nk, m->v, m->nv);
+}
+
+void
+getmsg(Blk *b, int i, Msg *m)
+{
+	char *p;
+	int o;
+
+	assert(b->type == Pivot);
+	assert(i >= 0 && i < b->nmsg);
+	o = GBIT16(b->data + Pivspc + 2*i);
+
+	p = b->data + Pivspc + o;
+	m->type = Vinl;
+	m->op = *p;
+	m->nk = GBIT16(p + 1);
+	m->k = p + 3;
+	m->nv = GBIT16(p + 3 + m->nk);
+	m->v = p + 5 + m->nk;
+}
+
+void
+blkinsert(Blk *b, Kvp *kv)
+{
+	int lo, hi, mid, r;
+	Kvp cmp;
+
+	r = -1;
+	lo = 0;
+	hi = b->nent;
+	while(lo < hi){
+		mid = (hi + lo) / 2;
+		getval(b, mid, &cmp);
+		r = keycmp(kv, &cmp);
+		if(r <= 0)
+			hi = mid;
+		else
+			lo = mid + 1;
+	}
+	if(lo < b->nent){
+		getval(b, lo, &cmp);
+		r = keycmp(kv, &cmp);
+	}
+	setval(b, lo, kv, (r == 0));
+}
+
+void
+bufinsert(Blk *b, Msg *m)
+{
+	int lo, hi, mid, r;
+	Msg cmp;
+
+	lo = 0;
+	hi = b->nmsg;
+	while(lo < hi){
+		mid = (hi + lo) / 2;
+		getmsg(b, mid, &cmp);
+		r = keycmp(m, &cmp);
+		if(r <= 0)
+			hi = mid;
+		else
+			lo = mid + 1;
+	}
+	setmsg(b, lo, m);
+}
+
+static int
+bufsearch(Blk *b, Key *k, Msg *m, int *idx)
+{
+	int lo, hi, mid, r;
+
+	lo = 0;
+	hi = b->nmsg - 1;
+	while(lo <= hi){
+		mid = (hi + lo) / 2;
+		getmsg(b, mid, m);
+		r = keycmp(k, m);
+		if(r == 0){
+			*idx = mid;
+			return 0;
+		}
+		if(r < 0)
+			hi = mid - 1;
+		else
+			lo = mid + 1;
+	}
+	return -1;
+}
+
+int
+blkdelete(Blk *b, Kvp *kv)
+{
+	int lo, hi, mid, r;
+	Kvp cmp;
+
+	lo = 0;
+	hi = b->nent - 1;
+	while(lo <= hi){
+		mid = (hi + lo) / 2;
+		getval(b, mid, &cmp);
+		r = keycmp(kv, &cmp);
+		if(r == 0)
+			delval(b, mid);
+		if(r <= 0)
+			hi = mid - 1;
+		else
+			lo = mid + 1;
+	}
+	return -1;
+}
+
+int
+blksearch(Blk *b, Key *k, Kvp *rp, int *idx, int *same)
+{
+	int lo, hi, mid, r;
+	Kvp cmp;
+
+	r = -1;
+	lo = 0;
+	hi = b->nent;
+	while(lo < hi){
+		mid = (hi + lo) / 2;
+		getval(b, mid, &cmp);
+		r = keycmp(k, &cmp);
+		if(r < 0)
+			hi = mid;
+		else
+			lo = mid + 1;
+	}
+	lo--;
+	if(lo >= 0){
+		getval(b, lo, rp);
+		r = keycmp(k, rp);
+	}
+	if(same != nil)
+		*same = (r == 0);
+	if(idx != nil)
+		*idx = lo;
+	return 0;
+}
+int
+filledbuf(Blk *b, int needed)
+{
+	assert(b->type == Pivot);
+	return 2*b->nmsg + b->bufsz > Bufspc - needed;
+}
+
+
+int
+filledleaf(Blk *b, int needed)
+{
+	assert(b->type == Leaf);
+	return 2*b->nent + b->valsz > Leafspc - needed;
+}
+
+int
+filledpiv(Blk *b, int reserve)
+{
+	/* 
+	 * We need to guarantee there's room for one message
+	 * at all times, so that splits along the whole path
+	 * have somewhere to go.
+	 */
+	assert(b->type == Pivot);
+	return 2*b->nent + b->valsz > Pivspc - reserve*Kpmax;
+}
+
+int
+copyup(Blk *n, int i, Path *pp, int *nbytes)
+{
+	Kvp kv;
+
+	
+	if(pp->split){
+		getval(pp->l, 0, &kv);
+		kv.type = Vref;
+		kv.bp = (uintptr)pp->l;
+		kv.bh = blkhash(pp->l);
+		setval(n, i++, &kv, 0);
+		if(nbytes != nil)
+			*nbytes += 2 + valsz(&kv);
+
+		getval(pp->r, 0, &kv);
+		kv.type = Vref;
+		kv.bp = (uintptr)pp->r;
+		kv.bh = blkhash(pp->r);
+		setval(n, i++, &kv, 0);
+		if(nbytes != nil)
+			*nbytes += 2 + valsz(&kv);
+	}else{
+		getval(pp->n, 0, &kv);
+		kv.type = Vref;
+		kv.bp = (uintptr)pp->n;
+		kv.bh = blkhash(pp->n);
+		setval(n, i++, &kv, 1);
+		if(nbytes != nil)
+			*nbytes += 2 + valsz(&kv);
+	}
+	return i;
+}
+
+/*
+ * Creates a new block with the contents of the old
+ * block. When copying the contents, it repacks them
+ * to minimize the space uses, and applies the changes
+ * pending from the downpath blocks.
+ *
+ * When pidx != -1, 
+ */
+Blk*
+update(Blk *b, Path *p, Path *pp)
+{
+	int i, j, lo, hi, pidx;
+	Blk *n;
+	Msg m;
+
+	j = 0;
+	lo = (p != nil) ? p->lo : -1;
+	hi = (p != nil) ? p->hi : -1;
+	pidx = (p != nil) ? p->idx : -1;
+	n = newblk(b->type);
+	for(i = 0; i < b->nent; i++){
+		if(i == pidx){
+			j = copyup(n, j, pp, nil);
+			continue;
+		}
+		getval(b, i, &m);
+		setval(n, j++, &m, 0);
+	}
+	if(b->type == Pivot){
+		i = 0;
+		j = 0;
+		while(i < b->nmsg){
+			if(i == lo)
+				i = hi;
+			if(i == b->nmsg)
+				break;
+			getmsg(b, i++, &m);
+			setmsg(n, j++, &m);
+		}
+		b->nmsg = j;
+	}
+	return n;
+}
+
+
+/*
+ * Splits a node, returning the block that msg
+ * would be inserted into. Split must never
+ * grow the total height of the 
+ */
+int
+split(Blk *b, Path *p, Path *pp, Kvp *mid)
+{
+	int i, j, copied, halfsz;
+	int lo, hi, pidx;
+	Blk *l, *r;
+	Kvp t;
+	Msg m;
+
+	/*
+	 * If the block one entry up the
+	 * p is nil, we're at the root,
+	 * so we want to make a new block.
+	 */
+	l = newblk(b->type);
+	r = newblk(b->type);
+	if(l == nil || r == nil)
+		goto error;
+
+	lo = (p != nil) ? p->lo : -1;
+	hi = (p != nil) ? p->hi : -1;
+	pidx = (p != nil) ? p->idx : -1;
+
+	j = 0;
+	copied = 0;
+	halfsz = (2*b->nent + b->valsz)/2;
+	for(i = 0; copied < halfsz; i++){
+		if(i == pidx){
+			j = copyup(l, j, pp, &copied);
+			continue;
+		}
+		getval(b, i, &t);
+		setval(l, j++, &t, 0);
+		copied += 2 + valsz(&t);
+	}
+	j = 0;
+	getval(b, i, mid);
+	for(; i < b->nent; i++){
+		if(i == pidx){
+			j = copyup(r, j, pp, nil);
+			continue;
+		}
+		getval(b, i, &t);
+		setval(r, j++, &t, 0);
+	}
+	if(b->type == Pivot){
+		i = 0;
+		for(j = 0; i < b->nmsg; i++, j++){
+			if(i == lo)
+				i = hi;
+			if(i == b->nmsg)
+				break;
+			getmsg(b, i, &m);
+			if(keycmp(&m, mid) >= 0)
+				break;
+			setmsg(l, j, &m);
+		}
+		for(j = 0; i < b->nmsg; i++, j++){
+			if(i == lo)
+				i = hi;
+			if(i == b->nmsg)
+				break;
+			getmsg(b, i, &m);
+			setmsg(r, j, &m);
+		}
+	}
+	p->split = 1;
+	p->l = l;
+	p->r = r;
+	return 0;
+error:
+	if(l != nil) freeblk(l);
+	if(r != nil) freeblk(r);
+	return -1;
+}
+
+int
+apply(Blk *b, Msg *m)
+{
+	assert(b->type == Leaf);
+	assert(b->flag & Bdirty);
+	switch(m->op){
+	case Ocreate:
+		blkinsert(b, m);
+		break;
+	case Odelete:
+		blkdelete(b, m);
+		break;
+	case Owrite:
+		werrstr("unimplemented");
+		goto error;
+	}
+	return 0;
+error:
+	werrstr("invalid upsert");
+	return -1;
+}
+
+static int
+flush(Path *path, int npath, Msg *ins, int *redo)
+{
+	static int nins;
+
+	Path *p, *pp;
+	Blk *b;
+	Kvp mid;
+	Msg m;
+	int i, ret;
+
+	ret = -1;
+	/*
+	 * The path must contain at minimum two elements:
+	 * we must have 1 node we're inserting into, and
+	 * an empty element at the top of the path that
+	 * we put the new root into if the root gets split.
+	 */
+	assert(npath >= 2);
+	*redo = 0;
+	pp = nil;
+	p = &path[npath - 1];
+	if(p->b->type == Leaf){
+		if(!filledleaf(p->b, p[-1].sz)){
+			p->n = update(p->b, p, pp);
+			for(i = p[-1].lo; i < p[-1].hi; i++){
+				getmsg(p[-1].b, i, &m);
+				if(filledleaf(p->n, msgsz(&m)))
+					break;
+				apply(p->n, &m);
+			}
+		}else{
+			if(split(p->b, p, pp, &mid) == -1)
+				goto error;
+			b = p->l;
+			for(i = p[-1].lo; i < p[-1].hi; i++){
+				getmsg(p[-1].b, i, &m);
+				if(keycmp(&m, &mid) >= 0)
+					b = p->r;
+				if(filledleaf(b, msgsz(&m)))
+					continue;
+				if(apply(b, &m) == -1)
+					goto error;
+			}
+		}
+		assert(p->n || (p->l && p->r));
+		pp = p;
+		p--;
+	}
+	for(; p > path; p--){
+		if(!filledpiv(p->b, 1)){
+			p->n = update(p->b, p, pp);
+			for(i = p[-1].lo; i < p[-1].hi; i++){
+				getmsg(p[-1].b, i, &m);
+				if(filledbuf(p->n, msgsz(&m)))
+					break;
+				bufinsert(p->n, &m);
+			}
+		}else{
+			if(split(p->b, p, pp, &mid) == -1)
+				goto error;
+			b = p->l;
+			for(i = p[-1].lo; i < p[-1].hi; i++){
+				getmsg(p[-1].b, i, &m);
+				if(keycmp(&m, &mid) >= 0)
+					b = p->r;
+				if(filledbuf(b, msgsz(&m)))
+					continue;
+				bufinsert(b, &m);
+			}
+		}
+		pp = p;
+	}
+	if(path[1].split){
+		if((path[0].n = newblk(Pivot)) == nil)
+			goto error;
+		copyup(path[0].n, 0, &path[1], nil);
+		bufinsert(path[0].n, ins);
+	}else{
+		if(path[1].b->type == Leaf && !filledleaf(path[1].b, msgsz(ins)))
+			apply(path[1].n, ins);
+		else if(!filledbuf(path[1].n, msgsz(ins)))
+			bufinsert(path[1].n, ins);
+		else
+			*redo = 1;
+	}
+	ret = 0;
+error:
+	for(p = path; p != path + npath; p++){
+		if(p->b != nil)
+			putblk(p->b);
+		if(p->n != nil)
+			putblk(p->n);
+		if(p->l != nil)
+			putblk(p->l);
+		if(p->r != nil)
+			putblk(p->r);
+	}
+	return ret;
+}
+
+/*
+ * Select child node that with the largest message
+ * segment in the current node's buffer.
+ */
+void
+victim(Blk *b, Path *p)
+{
+	int i, j, lo, maxsz, cursz;
+	Kvp kv;
+	Msg m;
+
+	j = 0;
+	lo = 0;
+	maxsz = 0;
+	p->b = b;
+	/* 
+	 * Start at the second pivot: all values <= this
+	 * go to the first node. Stop *after* the last entry,
+	 * because entries >= the last entry all go into it.
+	 */
+	for(i = 1; i <= b->nent; i++){
+		if(i < b->nent)
+			getval(b, i, &kv);
+		cursz = 0;
+		for(; j < b->nmsg; j++){
+			getmsg(b, j, &m);
+			if(i < b->nent && keycmp(&m, &kv) >= 0)
+				break;
+			/* 2 bytes for offset, plus message size in buffer */
+			cursz += 2 + msgsz(&m);
+		}
+		if(cursz > maxsz){
+			maxsz = cursz;
+			p->lo = lo;
+			p->hi = j;
+			p->sz = maxsz;
+			p->idx = i - 1;
+			lo = j;
+		}
+	}
+}
+
+int
+fsupsert(Msg *m)
+{
+	Blk *b, *n, *s;
+	int npath, redo;
+	Path *path;
+	Kvp sep;
+
+	if(m->nk + 2 > Keymax){
+		werrstr("overlong key");
+		return -1;
+	}
+	/*
+	 * The tree can grow in height by 1 when we
+	 * split, so we allocate room for one extra
+	 * node in the path.
+	 */
+again:
+	n = nil;
+	s = nil;
+	redo = 0;
+	npath = 0;
+	path = emalloc((fs->height + 2)*sizeof(Path));
+	path[npath].b = nil;
+	path[npath].idx = -1;
+	npath++;
+
+	b = fs->root;
+	path[0].sz = msgsz(m);
+	while(b->type == Pivot){
+		if(!filledbuf(b, path[npath - 1].sz))
+			break;
+		victim(b, &path[npath]);
+		getval(b, path[npath].idx, &sep);
+		b = getblk(sep.bp, sep.bh);
+		if(b == nil)
+			goto error;
+		npath++;
+	}
+	path[npath].b = b;
+	path[npath].idx = -1;
+	path[npath].lo = 0;
+	path[npath].hi = 0;
+	npath++;
+	if(flush(path, npath, m, &redo) == -1)
+		goto error;
+	b = path[0].n;
+	if(b != nil)
+		fs->height++;
+	else
+		b = path[1].n;
+	fs->root = b;
+	free(path);
+	if(redo)
+		goto again;
+	return 0;
+error:
+	if(n != nil) freeblk(n);
+	if(s != nil) freeblk(s);
+	free(path);
+	return -1;
+}
+
+static char*
+collect(Blk *b, Key *k, Kvp *r, int *done)
+{
+	int i, idx;
+	Msg m;
+
+	*done = 0;
+	if(bufsearch(b, k, &m, &idx) != 0)
+		return nil;
+	for(i = idx; i < b->nmsg; i++){
+		getmsg(b, i, &m);
+		if(keycmp(&m, k) != 0)
+			break;
+		switch(m.op){
+		case Ocreate:
+			*r = m.Kvp;
+			*done = 1;
+			return nil;
+		case Odelete:
+			*done = 1;
+			return Eexist;
+		/* The others don't affect walks */
+		}
+	}
+	return nil;
+}
+
+char*
+fswalk1(Key *k, Kvp *r)
+{
+	int idx, same, done;
+	char *ret;
+	Blk *b;
+
+	b = fs->root;
+	while(b->type == Pivot){
+		ret = collect(b, k, r, &done);
+		if(done)
+			return ret;
+		if(blksearch(b, k, r, &idx, nil) == -1 || idx == -1)
+			return Eexist;
+		if((b = getblk(r->bp, r->bh)) == nil)
+			return Efs;
+	}
+	assert(b->type == Leaf);
+	if(blksearch(b, k, r, nil, &same) == -1 || !same)
+		return Eexist;
+	return nil;
+}
--- /dev/null
+++ b/util.c
@@ -1,0 +1,38 @@
+#include <u.h>
+#include <libc.h>
+#include "dat.h"
+#include "fns.h"
+
+void *
+emalloc(usize n)
+{
+	void *v;
+	
+	v = mallocz(n, 1);
+	if(v == nil)
+		sysfatal("malloc: %r");
+	setmalloctag(v, getcallerpc(&n));
+	return v;
+}
+
+void *
+erealloc(void *p, usize n)
+{
+	void *v;
+	
+	v = realloc(p, n);
+	if(v == nil)
+		sysfatal("realloc: %r");
+	setmalloctag(v, getcallerpc(&n));
+	return v;
+}
+
+char*
+estrdup(char *s)
+{
+	s = strdup(s);
+	if(s == nil)
+		sysfatal("strdup: %r");
+	setmalloctag(s, getcallerpc(&s));
+	return s;
+}