ref: 3f842a00016a70d021ddf27a901d564a8e97ab61
author: Ori Bernstein <[email protected]>
date: Mon Dec 28 20:07:18 EST 2020
initial commit
--- /dev/null
+++ b/blk.c
@@ -1,0 +1,52 @@
+#include <u.h>
+#include <libc.h>
+
+#include "dat.h"
+#include "fns.h"
+
+Blk*
+newblk(int t)
+{
+ Blk *b;
+
+ b = emalloc(sizeof(Blk));
+ b->flag |= Bdirty;
+ b->type = t;
+ return b;
+}
+
+Blk*
+getblk(uvlong bp, uvlong bh)
+{
+ Blk *b;
+
+ b = (Blk*)bp;
+ if(blkhash(b) != bh){
+ werrstr("corrupt block %llx\n", bp);
+ fprint(2, "corrupt block %llx\n", bp);
+// abort();
+ }
+ return b;
+}
+
+void
+freeblk(Blk *b)
+{
+ b->refs--;
+ if(b->refs == 0)
+ free(b);
+}
+
+uvlong
+blkhash(Blk *b)
+{
+ return siphash(b->data, Blksz);
+}
+
+int
+putblk(Blk *b)
+{
+ USED(b);
+ /* TODO: unref. */
+ return 0;
+}
--- /dev/null
+++ b/check.c
@@ -1,0 +1,142 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+#include "dat.h"
+#include "fns.h"
+
+char spc[128];
+int debug;
+
+static int
+invalidblk(Blk *b, int h, Kvp *lo, Kvp *hi)
+{
+ Kvp x, y;
+ int i, r;
+ Blk *c;
+ int fail;
+
+ fail = 0;
+ if(b->type == Leaf){
+ if(h != fs->height){
+ fprint(2, "unbalanced leaf\n");
+ fail++;
+ }
+ if(h != 1 && b->nent < 2){
+ fprint(2, "underfilled leaf\n");
+ fail++;
+ }
+ }
+ if(b->type == Pivot && b->nent < 2){
+ fprint(2, "underfilled pivot\n");
+ fail++;
+ }
+ getval(b, 0, &x);
+ if(lo && keycmp(lo, &x) != 0)
+ fprint(2, "out of range keys %P != %P\n", lo, &x);
+ for(i = 1; i < b->nent; i++){
+ getval(b, i, &y);
+ if(hi && keycmp(&y, hi) >= 0){
+ fprint(2, "out of range keys %P >= %P\n", &y, hi);
+ fail++;
+ }
+ if(b->type == Pivot){
+ c = getblk(x.bp, x.bh);
+ if(invalidblk(c, h + 1, &x, &y))
+ fail++;
+ }
+ r = keycmp(&x, &y);
+ switch(r){
+ case -1:
+ break;
+ case 0:
+ fprint(2, "duplicate keys %P, %P\n", &x, &y);
+ fail++;
+ break;
+ case 1:
+ fprint(2, "misordered keys %P, %P\n", &x, &y);
+ fail++;
+ break;
+ }
+ x = y;
+ }
+ return fail;
+}
+
+
+/* TODO: this will grow into fsck. */
+int
+checkfs(void)
+{
+ return invalidblk(fs->root, 1, nil, nil) == 0;
+}
+
+static void
+rshowblk(Blk *b, int indent)
+{
+ Blk *c;
+ int i;
+ Kvp kv;
+ Msg m;
+
+ if(indent > sizeof(spc)/4)
+ indent = sizeof(spc)/4;
+ if(b == nil){
+ print("NIL\n");
+ return;
+ }
+ if(b->type == Pivot){
+ for(i = 0; i < b->nmsg; i++){
+ getmsg(b, i, &m);
+ print("%.*s|%M\n", 4*indent, spc, &m);
+ }
+ }
+ for(i = 0; i < b->nent; i++){
+ getval(b, i, &kv);
+ print("%.*s|%P\n", 4*indent, spc, &kv);
+ if(b->type == Pivot){
+ if((c = getblk(kv.bp, kv.bh)) == nil)
+ sysfatal("falied load: %r");
+ rshowblk(c, indent + 1);
+ }
+ }
+}
+
+
+void
+initshow(void)
+{
+ int i;
+
+ memset(spc, ' ', sizeof(spc));
+ for(i = 0; i < sizeof(spc); i += 4)
+ spc[i] = '|';
+}
+
+void
+showblk(Blk *b, char *m)
+{
+ if(m != nil)
+ print("=== %s\n", m);
+ rshowblk(b, 0);
+}
+
+void
+showfs(char *m)
+{
+ if(m != nil)
+ print("=== %s\n", m);
+ rshowblk(fs->root, 0);
+}
+
+
+void
+showpath(Path *p, int np)
+{
+ int i;
+
+ for(i = 0; i < np; i++){
+ print("==> b=%p, n=%p, idx=%d\n", p[i].b, p[i].n, p[i].idx);
+ print("\tclear=(%d, %d):%d\n", p[i].lo, p[i].hi, p[i].sz);
+ print("\tl=%p, r=%p, n=%p, split=%d\n", p[i].l, p[i].r, p[i].n, p[i].split);
+ }
+}
--- /dev/null
+++ b/dat.h
@@ -1,0 +1,139 @@
+typedef struct Blk Blk;
+typedef struct Gefs Gefs;
+typedef struct Msg Msg;
+typedef struct Key Key;
+typedef struct Val Val;
+typedef struct Kvp Kvp;
+typedef struct Path Path;
+
+enum {
+ // buffer for the whole block
+ Blksz = 256,
+ // will store what type of block
+ Hdrsz = 16,
+ Blkspc = Blksz - Hdrsz,
+ // space for message buffer
+ Bufspc = Blkspc / 2,
+ // space for pivot keys and offsets
+ Pivspc = Blkspc - Bufspc,
+ Leafspc = Blkspc,
+ Keymax = 16,
+ Inlmax = 64,
+ Ptrsz = 16,
+ Kvmax = Keymax + Inlmax, /* Key and value */
+ Kpmax = Keymax + Ptrsz, /* Key and pointer */
+ Msgmax = 1 + (Kvmax > Kpmax ? Kvmax : Kpmax)
+};
+
+enum {
+ Bdirty = 1 << 0,
+};
+
+#define Efs "i will not buy this fs, it is scratched"
+#define Eexist "does not exist"
+#define Ebotch "protocol botch"
+#define Emode "unknown mode"
+#define Efull "file system full"
+#define Eauth "authentication failed"
+#define Elength "name too long"
+#define Eperm "permission denied"
+
+/*
+ * The type of block. Pivot nodes are internal to the tree,
+ * while leaves inhabit the edges. Pivots are split in half,
+ * containing a buffer for the data, and keys to direct the
+ * searches. Their buffers contain messages en-route to the
+ * leaves.
+ *
+ * Leaves contain the key, and some chunk of data as the
+ * value.
+ */
+enum {
+ Pivot,
+ Leaf,
+};
+
+enum {
+ Vinl, /* Inline value */
+ Vref, /* Block pointer */
+};
+
+enum {
+ Ocreate,
+ Odelete,
+ Owrite,
+ Owstat,
+};
+
+/*
+ * Overall state of the file sytem.
+ * Shadows the superblock contents.
+ */
+struct Gefs {
+ int blksz;
+ int bufsz;
+ int pivsz;
+ int hdrsz;
+
+ int height;
+ Blk *root;
+
+};
+
+struct Key{
+ char *k;
+ int nk;
+};
+
+struct Val {
+ int type;
+ union {
+ /* block pointer */
+ struct {
+ uvlong bp;
+ uvlong bh;
+ };
+ /* inline values */
+ struct {
+ short nv;
+ char *v;
+ };
+ };
+};
+
+struct Kvp {
+ Key;
+ Val;
+};
+
+struct Msg {
+ char op;
+ Kvp;
+};
+
+struct Path {
+ /* Flowing down for flush */
+ Blk *b; /* insertion */
+ int idx; /* insert at */
+ int lo; /* key range */
+ int hi; /* key range */
+ int sz; /* size of range */
+
+ /* Flowing up from flush */
+ Blk *l; /* left of split */
+ Blk *r; /* right of split */
+ Blk *n; /* shadowed node */
+ int split; /* did we split? */
+};
+
+struct Blk {
+ char type;
+ char flag;
+ short nent;
+ short valsz;
+ short nmsg;
+ short bufsz;
+ vlong off; /* -1 for unallocated */
+ int refs; /* TODO: move out */
+ char data[Blksz];
+};
--- /dev/null
+++ b/fns.h
@@ -1,0 +1,44 @@
+#pragma varargck type "M" Msg*
+#pragma varargck type "P" Kvp*
+#pragma varargck type "K" Key*
+#pragma varargck type "V" Val*
+#pragma varargck type "B" Blk*
+
+extern Gefs *fs;
+extern int debug;
+
+void initfs(void);
+
+Blk* newblk(int type);
+Blk* shadow(Blk*, Path*, Path*);
+int putblk(Blk*);
+Blk* getblk(uvlong bp, uvlong bh);
+void freeblk(Blk *b);
+uvlong blkhash(Blk*);
+uvlong siphash(void*, usize);
+
+int fsupsert(Msg*);
+char *fswalk1(Key*, Kvp*);
+
+void* emalloc(usize);
+void* erealloc(void*, usize);
+char* estrdup(char*);
+
+int keycmp(Key *, Key *);
+
+/* for dumping */
+void getval(Blk *, int, Kvp *);
+void getmsg(Blk *, int, Msg *);
+
+void initshow(void);
+void showblk(Blk*, char*);
+void showpath(Path*, int);
+void showfs(char*);
+int checkfs(void);
+
+/* scratch */
+void setmsg(Blk *, int, Msg *);
+void bufinsert(Blk *, Msg *);
+void victim(Blk *b, Path *p);
+void blkinsert(Blk *b, Kvp *kv);
+
--- /dev/null
+++ b/hash.c
@@ -1,0 +1,99 @@
+/*
+ Copyright (c) 2013 Marek Majkowski <[email protected]>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+ Original location:
+ https://github.com/majek/csiphash/
+
+ Solution inspired by code from:
+ Samuel Neves (supercop/crypto_auth/siphash24/little)
+ djb (supercop/crypto_auth/siphash24/little2)
+ Jean-Philippe Aumasson (https://131002.net/siphash/siphash24.c)
+*/
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+
+#define _le64toh(x) \
+ GBIT64((char*)&x)
+
+
+#define ROTATE(x, b) (u64int)( ((x) << (b)) | ( (x) >> (64 - (b))) )
+
+#define HALF_ROUND(a,b,c,d,s,t) \
+ a += b; c += d; \
+ b = ROTATE(b, s) ^ a; \
+ d = ROTATE(d, t) ^ c; \
+ a = ROTATE(a, 32);
+
+#define DOUBLE_ROUND(v0,v1,v2,v3) \
+ HALF_ROUND(v0,v1,v2,v3,13,16); \
+ HALF_ROUND(v2,v1,v0,v3,17,21); \
+ HALF_ROUND(v0,v1,v2,v3,13,16); \
+ HALF_ROUND(v2,v1,v0,v3,17,21);
+
+
+u64int
+siphash24(const void *src, unsigned long src_sz, const char key[16]) {
+ const u64int *_key = (u64int *)key;
+ u64int k0 = _le64toh(_key[0]);
+ u64int k1 = _le64toh(_key[1]);
+ u64int b = (u64int)src_sz << 56;
+ const u64int *in = (u64int*)src;
+
+ u64int v0 = k0 ^ 0x736f6d6570736575ULL;
+ u64int v1 = k1 ^ 0x646f72616e646f6dULL;
+ u64int v2 = k0 ^ 0x6c7967656e657261ULL;
+ u64int v3 = k1 ^ 0x7465646279746573ULL;
+
+ while (src_sz >= 8) {
+ u64int mi = _le64toh(*in);
+ in += 1; src_sz -= 8;
+ v3 ^= mi;
+ DOUBLE_ROUND(v0,v1,v2,v3);
+ v0 ^= mi;
+ }
+
+ u64int t = 0; u8int *pt = (u8int *)&t; u8int *m = (u8int *)in;
+ switch (src_sz) {
+ case 7: pt[6] = m[6];
+ case 6: pt[5] = m[5];
+ case 5: pt[4] = m[4];
+ case 4: *((u32int*)&pt[0]) = *((u32int*)&m[0]); break;
+ case 3: pt[2] = m[2];
+ case 2: pt[1] = m[1];
+ case 1: pt[0] = m[0];
+ }
+ b |= _le64toh(t);
+
+ v3 ^= b;
+ DOUBLE_ROUND(v0,v1,v2,v3);
+ v0 ^= b; v2 ^= 0xff;
+ DOUBLE_ROUND(v0,v1,v2,v3);
+ DOUBLE_ROUND(v0,v1,v2,v3);
+ return (v0 ^ v1) ^ (v2 ^ v3);
+}
+
+u64int
+siphash(void *src, usize len)
+{
+ char key[16] = "gefsgefsgefsgefs";
+ return siphash24(src, len, key);
+}
--- /dev/null
+++ b/main.c
@@ -1,0 +1,163 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+#include "dat.h"
+#include "fns.h"
+
+Gefs *fs;
+
+static int
+Bconv(Fmt *fmt)
+{
+ Blk *b;
+
+ b = va_arg(fmt->args, Blk*);
+ if(b == nil)
+ return fmtprint(fmt, "Blk(nil)");
+ return fmtprint(fmt, "Blk(%c)", (b->type == Pivot) ? 'P' : 'L');
+}
+
+static int
+Mconv(Fmt *fmt)
+{
+ char *opname[] = {
+ [Ocreate] "Ocreate",
+ [Odelete] "Odelete",
+ [Owrite] "Owrite",
+ [Owstat] "Owstat",
+ };
+ Msg *m;
+
+ m = va_arg(fmt->args, Msg*);
+ return fmtprint(fmt, "Msg(%s, %.*s,%.*s)", opname[m->op], m->nk, m->k, m->nv, m->v);
+}
+
+static int
+Pconv(Fmt *fmt)
+{
+ Kvp *kv;
+
+ kv = va_arg(fmt->args, Kvp*);
+ if(kv->type == Vinl)
+ return fmtprint(fmt, "Kvp(%.*s,%.*s)", kv->nk, kv->k, kv->nv, kv->v);
+ else
+ return fmtprint(fmt, "Kvp(%.*s,(%llx,%llx))", kv->nk, kv->k, kv->bp, kv->bh);
+}
+
+static int
+Kconv(Fmt *fmt)
+{
+ Key *k;
+
+ k = va_arg(fmt->args, Key*);
+ return fmtprint(fmt, "Key(%.*s)", k->nk, k->k);
+}
+
+static void
+init(void)
+{
+ initshow();
+ quotefmtinstall();
+ fmtinstall('B', Bconv);
+ fmtinstall('M', Mconv);
+ fmtinstall('P', Pconv);
+ fmtinstall('K', Kconv);
+ fs = emalloc(sizeof(Gefs));
+ fs->root = newblk(Leaf);
+ fs->height = 1;
+}
+
+int
+test(char *path)
+{
+ Biobuf *bfd;
+ char *e, *ln, *f[3];
+ int nf;
+ Msg m;
+ Kvp r;
+ Key k;
+
+ if((bfd = Bopen(path, OREAD)) == nil)
+ sysfatal("open %s: %r", path);
+ while((ln = Brdstr(bfd, '\n', 1)) != nil){
+ memset(f, 0, sizeof(f));
+ nf = tokenize(ln, f, nelem(f));
+ if(nf < 1 || strlen(f[0]) != 1)
+ sysfatal("malformed test file");
+ switch(*f[0]){
+ case '#':
+ break;
+ case 'I':
+ if(nf != 3)
+ sysfatal("malformed insert");
+ m.type = Vinl;
+ m.k = f[1];
+ m.v = f[2];
+ m.op = Ocreate;
+ m.nk = strlen(f[1]);
+ m.nv = strlen(f[2]);
+ print("insert (%s, %s)\n", m.k, m.v);
+ if(fsupsert(&m) == -1){
+ print("failed insert (%s, %s): %r\n", f[1], f[2]);
+ return -1;
+ }
+ break;
+ case 'D':
+ if(nf != 2)
+ sysfatal("malformed delete");
+ m.type = Vinl;
+ m.op = Odelete;
+ m.k = f[1];
+ m.v = nil;
+ m.nk = strlen(f[1]);
+ m.nv = 0;
+ print("delete %s\n", f[1]);
+ if(fsupsert(&m) == -1){
+ print("failed delete (%s): %r\n", f[1]);
+ return -1;
+ }
+ break;
+ case 'G':
+ k.k = f[1];
+ k.nk = strlen(f[1]);
+ e = fswalk1(&k, &r);
+ if(e != nil){
+ print("failed lookup on (%s): %s\n", f[1], e);
+ return -1;
+ }
+ break;
+ case 'S':
+ showfs("fs");
+ print("\n\n");
+ break;
+ case 'C':
+ checkfs();
+ break;
+ case 'V':
+ debug++;
+ break;
+ case 'X':
+ exits(f[1]);
+ break;
+ }
+// if(!checkfs())
+// abort();
+ free(ln);
+ }
+ return 0;
+}
+
+void
+main(int argc, char **argv)
+{
+ int i;
+
+ ARGBEGIN{
+ }ARGEND;
+
+ init();
+ for(i = 0; i < argc; i++)
+ if(test(argv[0]) == -1)
+ sysfatal("test %s: %r\n", argv[i]);
+ exits(nil);
+}
--- /dev/null
+++ b/mkfile
@@ -1,0 +1,16 @@
+</$objtype/mkfile
+
+TARG=gefs
+OFILES=\
+ blk.$O\
+ check.$O\
+ hash.$O\
+ main.$O\
+ tree.$O\
+ util.$O\
+
+HFILES=\
+ dat.h\
+ fns.h
+
+</sys/src/cmd/mkone
--- /dev/null
+++ b/tree.c
@@ -1,0 +1,771 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+
+#include "dat.h"
+#include "fns.h"
+
+int lookup;
+void
+dprint(char *fmt, ...)
+{
+ va_list ap;
+
+ if(!debug)
+ return;
+ va_start(ap, fmt);
+ vfprint(2, fmt, ap);
+ va_end(ap);
+}
+
+int
+keycmp(Key *a, Key *b)
+{
+ int c, n;
+
+ n = (a->nk < b->nk) ? a->nk : b->nk;
+ if((c = memcmp(a->k, b->k, n)) != 0)
+ return c < 0 ? -1 : 1;
+ if(a->nk < b->nk)
+ return -1;
+ else if(a->nk > b->nk)
+ return 1;
+ else
+ return 0;
+}
+
+int
+msgsz(Msg *m)
+{
+ return 1 + 2 + m->nk + 2 + m->nv;
+}
+
+int
+valsz(Kvp *kv)
+{
+ if(kv->type == Vref)
+ return 2+kv->nk + Ptrsz;
+ else
+ return 2+kv->nk + 2+kv->nv;
+}
+
+void
+getval(Blk *b, int i, Kvp *kv)
+{
+ int o;
+
+ assert(i >= 0 && i < b->nent);
+ o = GBIT16(b->data + 2*i);
+ if(b->type == Pivot){
+ kv->type = Vref;
+ kv->nk = GBIT16(b->data + o);
+ kv->k = b->data + o + 2;
+ kv->bp = GBIT64(kv->k + kv->nk + 0);
+ kv->bh = GBIT64(kv->k + kv->nk + 8);
+ }else{
+ kv->type = Vinl;
+ kv->nk = GBIT16(b->data + o);
+ kv->k = b->data + o + 2;
+ kv->nv = GBIT16(kv->k + kv->nk);
+ kv->v = kv->k + kv->nk + 2;
+ }
+}
+
+static void
+setval(Blk *b, int i, Kvp *kv, int replace)
+{
+ int o, nk, nv, ksz, vsz, spc;
+ char *p;
+
+ assert(i >= 0 && i <= b->nent);
+ spc = (b->type == Leaf) ? Leafspc : Pivspc;
+ p = b->data + 2*i;
+ nk = 2 + kv->nk;
+ nv = (kv->type == Vref) ? Ptrsz : 2 + kv->nv;
+ if (i < 0)
+ i = 0;
+ if(!replace || b->nent == i){
+ memmove(p + 2, p, 2*(b->nent - i));
+ b->nent++;
+ b->valsz += nk + nv;
+ o = spc - b->valsz;
+ }else{
+ /*
+ * If we can't put it where it was before,
+ * we need to allocate new space for the
+ * key-value data. It'll get compacted when
+ * we split or merge.
+ */
+ o = GBIT16(b->data + 2*i);
+ ksz = 2 + GBIT16(b->data + o);
+ if(b->type == Leaf)
+ vsz = 2 + GBIT16(b->data + o + ksz);
+ else
+ vsz = 16;
+ if(ksz + vsz < nk + nv){
+ b->valsz += nk + nv;
+ o = spc - b->valsz;
+ }
+ }
+
+ if(2*b->nent + b->valsz > spc)
+ showblk(b, "setval overflow");
+ assert(2*b->nent + b->valsz <= spc);
+ assert(2*b->nent < o);
+ p = b->data + o;
+ if(b->type == Pivot){
+ assert(kv->type == Vref);
+ PBIT16(b->data + 2*i, o);
+ PBIT16(p + 0, kv->nk);
+ memcpy(p + 2, kv->k, kv->nk);
+ PBIT64(p + kv->nk + 2, kv->bp);
+ PBIT64(p + kv->nk + 10, kv->bh);
+ } else {
+ assert(kv->type == Vinl);
+ PBIT16(b->data + 2*i, o);
+ PBIT16(p + 0, kv->nk);
+ memcpy(p + 2, kv->k, kv->nk);
+ PBIT16(p + kv->nk + 2, kv->nv);
+ memcpy(p + kv->nk + 4, kv->v, kv->nv);
+ }
+}
+
+static int
+delval(Blk *b, int i)
+{
+ char *p;
+
+ assert(i >= 0 && i <= b->nent);
+ b->nent--;
+ p = b->data + 2*i;
+ memmove(p, p + 2, 2*(b->nent - i));
+ return 0;
+}
+
+void
+setmsg(Blk *b, int i, Msg *m)
+{
+ char *p;
+ int o;
+
+ assert(b->type == Pivot);
+ assert(i >= 0 && i <= b->nmsg);
+ b->nmsg++;
+ b->bufsz += msgsz(m);
+ assert(2*b->nent + b->bufsz <= Bufspc);
+
+ p = b->data + Pivspc;
+ o = Pivspc - b->bufsz;
+ memmove(p + 2*(i+1), p+2*i, 2*(b->nmsg - i));
+ PBIT16(p + 2*i, o);
+
+ p = b->data + Bufspc + o;
+ *p = m->op;
+ PBIT16(p + 1, m->nk);
+ memcpy(p + 3, m->k, m->nk);
+ PBIT16(p + 3 + m->nk, m->nv);
+ memcpy(p + 5 + m->nk, m->v, m->nv);
+}
+
+void
+getmsg(Blk *b, int i, Msg *m)
+{
+ char *p;
+ int o;
+
+ assert(b->type == Pivot);
+ assert(i >= 0 && i < b->nmsg);
+ o = GBIT16(b->data + Pivspc + 2*i);
+
+ p = b->data + Pivspc + o;
+ m->type = Vinl;
+ m->op = *p;
+ m->nk = GBIT16(p + 1);
+ m->k = p + 3;
+ m->nv = GBIT16(p + 3 + m->nk);
+ m->v = p + 5 + m->nk;
+}
+
+void
+blkinsert(Blk *b, Kvp *kv)
+{
+ int lo, hi, mid, r;
+ Kvp cmp;
+
+ r = -1;
+ lo = 0;
+ hi = b->nent;
+ while(lo < hi){
+ mid = (hi + lo) / 2;
+ getval(b, mid, &cmp);
+ r = keycmp(kv, &cmp);
+ if(r <= 0)
+ hi = mid;
+ else
+ lo = mid + 1;
+ }
+ if(lo < b->nent){
+ getval(b, lo, &cmp);
+ r = keycmp(kv, &cmp);
+ }
+ setval(b, lo, kv, (r == 0));
+}
+
+void
+bufinsert(Blk *b, Msg *m)
+{
+ int lo, hi, mid, r;
+ Msg cmp;
+
+ lo = 0;
+ hi = b->nmsg;
+ while(lo < hi){
+ mid = (hi + lo) / 2;
+ getmsg(b, mid, &cmp);
+ r = keycmp(m, &cmp);
+ if(r <= 0)
+ hi = mid;
+ else
+ lo = mid + 1;
+ }
+ setmsg(b, lo, m);
+}
+
+static int
+bufsearch(Blk *b, Key *k, Msg *m, int *idx)
+{
+ int lo, hi, mid, r;
+
+ lo = 0;
+ hi = b->nmsg - 1;
+ while(lo <= hi){
+ mid = (hi + lo) / 2;
+ getmsg(b, mid, m);
+ r = keycmp(k, m);
+ if(r == 0){
+ *idx = mid;
+ return 0;
+ }
+ if(r < 0)
+ hi = mid - 1;
+ else
+ lo = mid + 1;
+ }
+ return -1;
+}
+
+int
+blkdelete(Blk *b, Kvp *kv)
+{
+ int lo, hi, mid, r;
+ Kvp cmp;
+
+ lo = 0;
+ hi = b->nent - 1;
+ while(lo <= hi){
+ mid = (hi + lo) / 2;
+ getval(b, mid, &cmp);
+ r = keycmp(kv, &cmp);
+ if(r == 0)
+ delval(b, mid);
+ if(r <= 0)
+ hi = mid - 1;
+ else
+ lo = mid + 1;
+ }
+ return -1;
+}
+
+int
+blksearch(Blk *b, Key *k, Kvp *rp, int *idx, int *same)
+{
+ int lo, hi, mid, r;
+ Kvp cmp;
+
+ r = -1;
+ lo = 0;
+ hi = b->nent;
+ while(lo < hi){
+ mid = (hi + lo) / 2;
+ getval(b, mid, &cmp);
+ r = keycmp(k, &cmp);
+ if(r < 0)
+ hi = mid;
+ else
+ lo = mid + 1;
+ }
+ lo--;
+ if(lo >= 0){
+ getval(b, lo, rp);
+ r = keycmp(k, rp);
+ }
+ if(same != nil)
+ *same = (r == 0);
+ if(idx != nil)
+ *idx = lo;
+ return 0;
+}
+int
+filledbuf(Blk *b, int needed)
+{
+ assert(b->type == Pivot);
+ return 2*b->nmsg + b->bufsz > Bufspc - needed;
+}
+
+
+int
+filledleaf(Blk *b, int needed)
+{
+ assert(b->type == Leaf);
+ return 2*b->nent + b->valsz > Leafspc - needed;
+}
+
+int
+filledpiv(Blk *b, int reserve)
+{
+ /*
+ * We need to guarantee there's room for one message
+ * at all times, so that splits along the whole path
+ * have somewhere to go.
+ */
+ assert(b->type == Pivot);
+ return 2*b->nent + b->valsz > Pivspc - reserve*Kpmax;
+}
+
+int
+copyup(Blk *n, int i, Path *pp, int *nbytes)
+{
+ Kvp kv;
+
+
+ if(pp->split){
+ getval(pp->l, 0, &kv);
+ kv.type = Vref;
+ kv.bp = (uintptr)pp->l;
+ kv.bh = blkhash(pp->l);
+ setval(n, i++, &kv, 0);
+ if(nbytes != nil)
+ *nbytes += 2 + valsz(&kv);
+
+ getval(pp->r, 0, &kv);
+ kv.type = Vref;
+ kv.bp = (uintptr)pp->r;
+ kv.bh = blkhash(pp->r);
+ setval(n, i++, &kv, 0);
+ if(nbytes != nil)
+ *nbytes += 2 + valsz(&kv);
+ }else{
+ getval(pp->n, 0, &kv);
+ kv.type = Vref;
+ kv.bp = (uintptr)pp->n;
+ kv.bh = blkhash(pp->n);
+ setval(n, i++, &kv, 1);
+ if(nbytes != nil)
+ *nbytes += 2 + valsz(&kv);
+ }
+ return i;
+}
+
+/*
+ * Creates a new block with the contents of the old
+ * block. When copying the contents, it repacks them
+ * to minimize the space uses, and applies the changes
+ * pending from the downpath blocks.
+ *
+ * When pidx != -1,
+ */
+Blk*
+update(Blk *b, Path *p, Path *pp)
+{
+ int i, j, lo, hi, pidx;
+ Blk *n;
+ Msg m;
+
+ j = 0;
+ lo = (p != nil) ? p->lo : -1;
+ hi = (p != nil) ? p->hi : -1;
+ pidx = (p != nil) ? p->idx : -1;
+ n = newblk(b->type);
+ for(i = 0; i < b->nent; i++){
+ if(i == pidx){
+ j = copyup(n, j, pp, nil);
+ continue;
+ }
+ getval(b, i, &m);
+ setval(n, j++, &m, 0);
+ }
+ if(b->type == Pivot){
+ i = 0;
+ j = 0;
+ while(i < b->nmsg){
+ if(i == lo)
+ i = hi;
+ if(i == b->nmsg)
+ break;
+ getmsg(b, i++, &m);
+ setmsg(n, j++, &m);
+ }
+ b->nmsg = j;
+ }
+ return n;
+}
+
+
+/*
+ * Splits a node, returning the block that msg
+ * would be inserted into. Split must never
+ * grow the total height of the
+ */
+int
+split(Blk *b, Path *p, Path *pp, Kvp *mid)
+{
+ int i, j, copied, halfsz;
+ int lo, hi, pidx;
+ Blk *l, *r;
+ Kvp t;
+ Msg m;
+
+ /*
+ * If the block one entry up the
+ * p is nil, we're at the root,
+ * so we want to make a new block.
+ */
+ l = newblk(b->type);
+ r = newblk(b->type);
+ if(l == nil || r == nil)
+ goto error;
+
+ lo = (p != nil) ? p->lo : -1;
+ hi = (p != nil) ? p->hi : -1;
+ pidx = (p != nil) ? p->idx : -1;
+
+ j = 0;
+ copied = 0;
+ halfsz = (2*b->nent + b->valsz)/2;
+ for(i = 0; copied < halfsz; i++){
+ if(i == pidx){
+ j = copyup(l, j, pp, &copied);
+ continue;
+ }
+ getval(b, i, &t);
+ setval(l, j++, &t, 0);
+ copied += 2 + valsz(&t);
+ }
+ j = 0;
+ getval(b, i, mid);
+ for(; i < b->nent; i++){
+ if(i == pidx){
+ j = copyup(r, j, pp, nil);
+ continue;
+ }
+ getval(b, i, &t);
+ setval(r, j++, &t, 0);
+ }
+ if(b->type == Pivot){
+ i = 0;
+ for(j = 0; i < b->nmsg; i++, j++){
+ if(i == lo)
+ i = hi;
+ if(i == b->nmsg)
+ break;
+ getmsg(b, i, &m);
+ if(keycmp(&m, mid) >= 0)
+ break;
+ setmsg(l, j, &m);
+ }
+ for(j = 0; i < b->nmsg; i++, j++){
+ if(i == lo)
+ i = hi;
+ if(i == b->nmsg)
+ break;
+ getmsg(b, i, &m);
+ setmsg(r, j, &m);
+ }
+ }
+ p->split = 1;
+ p->l = l;
+ p->r = r;
+ return 0;
+error:
+ if(l != nil) freeblk(l);
+ if(r != nil) freeblk(r);
+ return -1;
+}
+
+int
+apply(Blk *b, Msg *m)
+{
+ assert(b->type == Leaf);
+ assert(b->flag & Bdirty);
+ switch(m->op){
+ case Ocreate:
+ blkinsert(b, m);
+ break;
+ case Odelete:
+ blkdelete(b, m);
+ break;
+ case Owrite:
+ werrstr("unimplemented");
+ goto error;
+ }
+ return 0;
+error:
+ werrstr("invalid upsert");
+ return -1;
+}
+
+static int
+flush(Path *path, int npath, Msg *ins, int *redo)
+{
+ static int nins;
+
+ Path *p, *pp;
+ Blk *b;
+ Kvp mid;
+ Msg m;
+ int i, ret;
+
+ ret = -1;
+ /*
+ * The path must contain at minimum two elements:
+ * we must have 1 node we're inserting into, and
+ * an empty element at the top of the path that
+ * we put the new root into if the root gets split.
+ */
+ assert(npath >= 2);
+ *redo = 0;
+ pp = nil;
+ p = &path[npath - 1];
+ if(p->b->type == Leaf){
+ if(!filledleaf(p->b, p[-1].sz)){
+ p->n = update(p->b, p, pp);
+ for(i = p[-1].lo; i < p[-1].hi; i++){
+ getmsg(p[-1].b, i, &m);
+ if(filledleaf(p->n, msgsz(&m)))
+ break;
+ apply(p->n, &m);
+ }
+ }else{
+ if(split(p->b, p, pp, &mid) == -1)
+ goto error;
+ b = p->l;
+ for(i = p[-1].lo; i < p[-1].hi; i++){
+ getmsg(p[-1].b, i, &m);
+ if(keycmp(&m, &mid) >= 0)
+ b = p->r;
+ if(filledleaf(b, msgsz(&m)))
+ continue;
+ if(apply(b, &m) == -1)
+ goto error;
+ }
+ }
+ assert(p->n || (p->l && p->r));
+ pp = p;
+ p--;
+ }
+ for(; p > path; p--){
+ if(!filledpiv(p->b, 1)){
+ p->n = update(p->b, p, pp);
+ for(i = p[-1].lo; i < p[-1].hi; i++){
+ getmsg(p[-1].b, i, &m);
+ if(filledbuf(p->n, msgsz(&m)))
+ break;
+ bufinsert(p->n, &m);
+ }
+ }else{
+ if(split(p->b, p, pp, &mid) == -1)
+ goto error;
+ b = p->l;
+ for(i = p[-1].lo; i < p[-1].hi; i++){
+ getmsg(p[-1].b, i, &m);
+ if(keycmp(&m, &mid) >= 0)
+ b = p->r;
+ if(filledbuf(b, msgsz(&m)))
+ continue;
+ bufinsert(b, &m);
+ }
+ }
+ pp = p;
+ }
+ if(path[1].split){
+ if((path[0].n = newblk(Pivot)) == nil)
+ goto error;
+ copyup(path[0].n, 0, &path[1], nil);
+ bufinsert(path[0].n, ins);
+ }else{
+ if(path[1].b->type == Leaf && !filledleaf(path[1].b, msgsz(ins)))
+ apply(path[1].n, ins);
+ else if(!filledbuf(path[1].n, msgsz(ins)))
+ bufinsert(path[1].n, ins);
+ else
+ *redo = 1;
+ }
+ ret = 0;
+error:
+ for(p = path; p != path + npath; p++){
+ if(p->b != nil)
+ putblk(p->b);
+ if(p->n != nil)
+ putblk(p->n);
+ if(p->l != nil)
+ putblk(p->l);
+ if(p->r != nil)
+ putblk(p->r);
+ }
+ return ret;
+}
+
+/*
+ * Select child node that with the largest message
+ * segment in the current node's buffer.
+ */
+void
+victim(Blk *b, Path *p)
+{
+ int i, j, lo, maxsz, cursz;
+ Kvp kv;
+ Msg m;
+
+ j = 0;
+ lo = 0;
+ maxsz = 0;
+ p->b = b;
+ /*
+ * Start at the second pivot: all values <= this
+ * go to the first node. Stop *after* the last entry,
+ * because entries >= the last entry all go into it.
+ */
+ for(i = 1; i <= b->nent; i++){
+ if(i < b->nent)
+ getval(b, i, &kv);
+ cursz = 0;
+ for(; j < b->nmsg; j++){
+ getmsg(b, j, &m);
+ if(i < b->nent && keycmp(&m, &kv) >= 0)
+ break;
+ /* 2 bytes for offset, plus message size in buffer */
+ cursz += 2 + msgsz(&m);
+ }
+ if(cursz > maxsz){
+ maxsz = cursz;
+ p->lo = lo;
+ p->hi = j;
+ p->sz = maxsz;
+ p->idx = i - 1;
+ lo = j;
+ }
+ }
+}
+
+int
+fsupsert(Msg *m)
+{
+ Blk *b, *n, *s;
+ int npath, redo;
+ Path *path;
+ Kvp sep;
+
+ if(m->nk + 2 > Keymax){
+ werrstr("overlong key");
+ return -1;
+ }
+ /*
+ * The tree can grow in height by 1 when we
+ * split, so we allocate room for one extra
+ * node in the path.
+ */
+again:
+ n = nil;
+ s = nil;
+ redo = 0;
+ npath = 0;
+ path = emalloc((fs->height + 2)*sizeof(Path));
+ path[npath].b = nil;
+ path[npath].idx = -1;
+ npath++;
+
+ b = fs->root;
+ path[0].sz = msgsz(m);
+ while(b->type == Pivot){
+ if(!filledbuf(b, path[npath - 1].sz))
+ break;
+ victim(b, &path[npath]);
+ getval(b, path[npath].idx, &sep);
+ b = getblk(sep.bp, sep.bh);
+ if(b == nil)
+ goto error;
+ npath++;
+ }
+ path[npath].b = b;
+ path[npath].idx = -1;
+ path[npath].lo = 0;
+ path[npath].hi = 0;
+ npath++;
+ if(flush(path, npath, m, &redo) == -1)
+ goto error;
+ b = path[0].n;
+ if(b != nil)
+ fs->height++;
+ else
+ b = path[1].n;
+ fs->root = b;
+ free(path);
+ if(redo)
+ goto again;
+ return 0;
+error:
+ if(n != nil) freeblk(n);
+ if(s != nil) freeblk(s);
+ free(path);
+ return -1;
+}
+
+static char*
+collect(Blk *b, Key *k, Kvp *r, int *done)
+{
+ int i, idx;
+ Msg m;
+
+ *done = 0;
+ if(bufsearch(b, k, &m, &idx) != 0)
+ return nil;
+ for(i = idx; i < b->nmsg; i++){
+ getmsg(b, i, &m);
+ if(keycmp(&m, k) != 0)
+ break;
+ switch(m.op){
+ case Ocreate:
+ *r = m.Kvp;
+ *done = 1;
+ return nil;
+ case Odelete:
+ *done = 1;
+ return Eexist;
+ /* The others don't affect walks */
+ }
+ }
+ return nil;
+}
+
+char*
+fswalk1(Key *k, Kvp *r)
+{
+ int idx, same, done;
+ char *ret;
+ Blk *b;
+
+ b = fs->root;
+ while(b->type == Pivot){
+ ret = collect(b, k, r, &done);
+ if(done)
+ return ret;
+ if(blksearch(b, k, r, &idx, nil) == -1 || idx == -1)
+ return Eexist;
+ if((b = getblk(r->bp, r->bh)) == nil)
+ return Efs;
+ }
+ assert(b->type == Leaf);
+ if(blksearch(b, k, r, nil, &same) == -1 || !same)
+ return Eexist;
+ return nil;
+}
--- /dev/null
+++ b/util.c
@@ -1,0 +1,38 @@
+#include <u.h>
+#include <libc.h>
+#include "dat.h"
+#include "fns.h"
+
+void *
+emalloc(usize n)
+{
+ void *v;
+
+ v = mallocz(n, 1);
+ if(v == nil)
+ sysfatal("malloc: %r");
+ setmalloctag(v, getcallerpc(&n));
+ return v;
+}
+
+void *
+erealloc(void *p, usize n)
+{
+ void *v;
+
+ v = realloc(p, n);
+ if(v == nil)
+ sysfatal("realloc: %r");
+ setmalloctag(v, getcallerpc(&n));
+ return v;
+}
+
+char*
+estrdup(char *s)
+{
+ s = strdup(s);
+ if(s == nil)
+ sysfatal("strdup: %r");
+ setmalloctag(s, getcallerpc(&s));
+ return s;
+}