shithub: gefs

Download patch

ref: ed97b6fdc1a4acd4216b748cf82fe3ba35cb1bb1
parent: 2d851cca16a65146bc6d38c6d54418d20cb38d04
author: Ori Bernstein <[email protected]>
date: Sun Oct 16 12:56:04 EDT 2022

everything: rework buffer lifetimes completely

--- a/blk.c
+++ b/blk.c
@@ -23,8 +23,6 @@
 static vlong	blkalloc_lk(Arena*);
 static vlong	blkalloc(int);
 static int	blkdealloc_lk(vlong);
-static Blk*	blkbuf(void);
-static void	blkfree(Blk*);
 static Blk*	initblk(Blk*, vlong, int);
 static int	logop(Arena *, vlong, vlong, int);
 
@@ -66,7 +64,7 @@
 syncblk(Blk *b)
 {
 	assert(checkflag(b, Bfinal));
-	clrflag(b, Bqueued|Bdirty);
+	clrflag(b, Bdirty);
 	return pwrite(fs->fd, b->buf, Blksz, b->bp.addr);
 }
 
@@ -77,20 +75,20 @@
 	vlong off, rem, n;
 
 	assert(bp != -1);
-	if((b = blkbuf()) == nil)
+	if((b = cachepluck()) == nil)	/* buffer comes held from the LRU pool, not from malloc */
 		return nil;
+	b->alloced = getcallerpc(&bp);	/* debug: remember who asked for this buffer */
 	off = bp;
 	rem = Blksz;
 	while(rem != 0){
 		n = pread(fs->fd, b->buf, rem, off);
 		if(n <= 0){
-			blkfree(b);
+			dropblk(b);	/* pool buffers live in the sbrk'd array; release the hold so it returns to the LRU -- never free() it */
 			return nil;
 		}
 		off += n;
 		rem -= n;
 	}
-	b->ref = 1;
 	b->cnext = nil;
 	b->cprev = nil;
 	b->hnext = nil;
@@ -136,6 +134,7 @@
 		b->valsz = GBIT16(b->buf+4);
 		break;
 	}
+	assert(b->magic == Magic);
 	return b;
 }
 
@@ -144,7 +143,7 @@
 {
 	int n;
 
-	n = hint+ainc(&fs->roundrobin)/(64*1024);
+	n = hint+ainc(&fs->roundrobin)/(1024*1024);
 	return &fs->arenas[n%fs->narena];
 }
 
@@ -271,16 +270,16 @@
 		pb = lb;
 		if((o = blkalloc_lk(a)) == -1)
 			return -1;
-		if((lb = blkbuf()) == nil)
+		if((lb = cachepluck()) == nil)
 			return -1;
 		initblk(lb, o, Tlog);
-		cacheblk(lb);
+		cacheins(lb);
 		lb->logsz = Loghashsz;
 		p = lb->data + lb->logsz;
 		PBIT64(p, (uvlong)LogEnd);
 		finalize(lb);
 		if(syncblk(lb) == -1){
-			putblk(lb);
+			dropblk(lb);
 			return -1;
 		}
 
@@ -289,11 +288,10 @@
 			PBIT64(p, lb->bp.addr|LogChain);
 			finalize(pb);
 			if(syncblk(pb) == -1){
-				putblk(pb);
+				dropblk(pb);
 				return -1;
 			}
-			lrubump(pb);
-			putblk(pb);
+			dropblk(pb);
 		}
 		*tl = lb;
 	}
@@ -371,13 +369,13 @@
 		switch(op){
 		case LogEnd:
 			dprint("log@%d: end\n", i);
-			putblk(b);
+			dropblk(b);
 			return 0;
 		case LogChain:
 			bp.addr = off & ~0xff;
 			bp.hash = -1;
 			bp.gen = -1;
-			putblk(b);
+			dropblk(b);
 			dprint("log@%d: chain %B\n", i, bp);
 			goto Nextblk;
 			break;
@@ -431,7 +429,7 @@
 	 */
 	if((ba = blkalloc_lk(a)) == -1)
 		return -1;
-	if((b = blkbuf()) == nil)
+	if((b = cachepluck()) == nil)
 		return -1;
 	initblk(b, ba, Tlog);
 	b->logsz = Loghashsz;
@@ -440,7 +438,7 @@
 	PBIT64(p, (uvlong)LogEnd);
 	finalize(b);
 	if(syncblk(b) == -1){
-		putblk(b);
+		dropblk(b);
 		return -1;
 	}
 
@@ -448,7 +446,7 @@
 	if(a->tail != nil){
 		finalize(a->tail);
 		if(syncblk(a->tail) == -1){
-			blkfree(b);
+			dropblk(b);	/* b was obtained with cachepluck(): drop the hold (as the syncblk(b) failure path does) instead of free()ing pool memory */
 			return -1;
 		}
 	}
@@ -528,13 +526,13 @@
 					break;
 				}
 			}
-			putblk(b);
 			lock(a);
-			cachedel(bp.addr);
+			cachedel(b->bp.addr);
 			if(blkdealloc_lk(ba) == -1){
 				unlock(a);
 				return -1;
 			}
+			dropblk(b);
 			unlock(a);
 		}
 	}
@@ -639,61 +637,23 @@
 }
 
 static Blk*
-blkbuf(void)
-{
-	uvlong *p;
-	Blk *b;
-
-	qlock(&fs->freelk);
-	while(fs->free == nil)
-		rsleep(&fs->freerz);
-	b = fs->free;
-	fs->free = b->fnext;
-
-	/* check for corruption */
-	p = (uvlong*)b - 1;
-	assert(*p == HdMagic);
-
-	p = (uvlong*)(b + 1);
-	assert(*p == TlMagic);
-	qunlock(&fs->freelk);
-
-	/*
-	 * If the block is cached,
-	 * then the cache holds a ref
-	 * to the block, so we only
-	 * want to reset the refs
-	 * on an allocation.
-	 */
-	b->ref = 1;
-	b->cnext = nil;
-	b->cprev = nil;
-	b->hnext = nil;
-	b->flag = 0;
-
-	return b;
-}
-
-static void
-blkfree(Blk *b)
-{
-	b->bp.addr = -1;
-	qlock(&fs->freelk);
-	b->fnext = fs->free;
-	fs->free = b;
-	rwakeup(&fs->freerz);
-	qunlock(&fs->freelk);
-}
-
-static Blk*
 initblk(Blk *b, vlong bp, int t)
 {
-	assert(lookupblk(bp) == nil);
+	Blk *ob;
+
+	ob = cacheget(bp);
+	if(ob != nil){
+		fprint(2, "dup block: %#p %B (alloced %#llx freed %#llx lasthold: %#llx, lastdrop: %#llx)\n",
+			ob, ob->bp, ob->alloced, ob->freed, ob->lasthold, ob->lastdrop);
+		abort();
+	}
 	b->type = t;
 	b->bp.addr = bp;
 	b->bp.hash = -1;
 	b->bp.gen = fs->nextgen;
+	lock(&fs->freelk);
 	b->qgen = fs->qgen;
+	unlock(&fs->freelk);
 	switch(t){
 	case Traw:
 	case Tarena:
@@ -719,8 +679,9 @@
 	b->bufsz = 0;
 	b->logsz = 0;
 	b->lognxt = 0;
+	b->alloced = getcallerpc(&b);
 
-	return cacheblk(b);
+	return b;
 }
 
 Blk*
@@ -731,12 +692,10 @@
 
 	if((bp = blkalloc(t)) == -1)
 		return nil;
-	if((b = blkbuf()) == nil)
+	if((b = cachepluck()) == nil)
 		return nil;
 	initblk(b, bp, t);
-	cacheblk(b);
 	b->alloced = getcallerpc(&t);
-	assert(b->ref == 2);
 	return b;
 }
 
@@ -756,8 +715,8 @@
 	r->bufsz = b->bufsz;
 	r->logsz = b->logsz;
 	r->lognxt = b->lognxt;
+	r->alloced = getcallerpc(&b);
 	memcpy(r->buf, b->buf, sizeof(r->buf));
-	b->alloced = getcallerpc(&b);
 	return r;
 }
 
@@ -766,7 +725,6 @@
 {
 	uvlong h;
 
-	setflag(b, Bfinal);
 	if(b->type != Traw)
 		PBIT16(b->buf, b->type);
 	switch(b->type){
@@ -795,6 +753,9 @@
 	case Tarena:
 		break;
 	}
+
+	setflag(b, Bfinal);
+	cacheins(b);
 }
 
 Blk*
@@ -806,9 +767,7 @@
 
 	i = ihash(bp.addr) % nelem(fs->blklk);
 	qlock(&fs->blklk[i]);
-	if((b = lookupblk(bp.addr)) != nil){
-		cacheblk(b);
-		lrubump(b);
+	if((b = cacheget(bp.addr)) != nil){
 		qunlock(&fs->blklk[i]);
 		return b;
 	}
@@ -815,11 +774,11 @@
 	if((b = readblk(bp.addr, flg)) == nil){
 		qunlock(&fs->blklk[i]);
 		return nil;
-	}else
-		b->alloced = getcallerpc(&bp);
+	}
+	b->alloced = getcallerpc(&bp);
 	h = blkhash(b);
 	if((flg&GBnochk) == 0 && h != bp.hash){
-		fprint(2, "corrupt block %B: %.16llux != %.16llux\n", bp, h, bp.hash);
+		fprint(2, "corrupt block %p %B: %.16llux != %.16llux\n", b, bp, h, bp.hash);
 		qunlock(&fs->blklk[i]);
 		abort();
 		return nil;
@@ -826,8 +785,7 @@
 	}
 	b->bp.hash = h;
 	b->bp.gen = bp.gen;
-	cacheblk(b);
-	lrubump(b);
+	cacheins(b);
 	qunlock(&fs->blklk[i]);
 
 	return b;
@@ -834,12 +792,33 @@
 }
 
 Blk*
-refblk(Blk *b)
+holdblk(Blk *b)	/* take one more reference on b and return b for chaining */
 {
 	ainc(&b->ref);
+	b->lasthold = getcallerpc(&b);	/* debug: pc of the most recent caller to take a hold */
 	return b;
 }
 
+void
+dropblk(Blk *b)	/* release one reference on b; nil is ignored */
+{
+	assert(b == nil || b->ref > 0);
+	if(b == nil || adec(&b->ref) != 0)
+		return;
+	b->lastdrop = getcallerpc(&b);	/* debug: pc of the caller that dropped the last reference */
+//	assert(b->cprev == nil && b->cnext == nil);
+	/*
+	 * The last reference is gone, so the block is parked on the
+	 * LRU. While a freed block can get resurrected before
+	 * quiescence, it's unlikely -- so it goes into the bottom of
+	 * the LRU to get selected early for reuse; others go on top.
+	 */
+	if(checkflag(b, Bfreed))
+		lrubot(b);
+	else
+		lrutop(b);
+}
+
 ushort
 blkfill(Blk *b)
 {
@@ -856,21 +835,10 @@
 }
 
 void
-putblk(Blk *b)
-{
-	if(b == nil || adec(&b->ref) != 0)
-		return;
-	assert(!checkflag(b, Bcached));
-	assert(checkflag(b, Bfreed) || !checkflag(b, Bdirty));
-	blkfree(b);
-}
-
-void
 freebp(Tree *t, Bptr bp)
 {
 	Bfree *f;
 
-	dprint("[%s] free blk %B\n", (t == &fs->snap) ? "snap" : "data", bp);
 	if(t != nil && t != &fs->snap && bp.gen <= t->gen){
 		killblk(t, bp);
 		return;
@@ -878,41 +846,41 @@
 	if((f = malloc(sizeof(Bfree))) == nil)
 		return;
 	f->bp = bp;
-	lock(&fs->dealloclk);
-	f->next = fs->deallochd;
-	fs->deallochd = f;
-	unlock(&fs->dealloclk);
+	lock(&fs->freelk);
+	f->next = fs->freehd;
+	fs->freehd = f;
+	unlock(&fs->freelk);
 }
 
 void
 freeblk(Tree *t, Blk *b)
 {
-	b->freed = getcallerpc(&b);
+	b->freed = getcallerpc(&t);
 	setflag(b, Bfreed);
 	freebp(t, b->bp);
 }
 
 void
-reclaimblk(Bptr bp)
+epochstart(int tid)	/* worker tid begins handling a request */
 {
-	Arena *a;
+	ainc((long*)&fs->active[tid]);	/* atomic bump; epochclean() looks at this counter's parity and whether it moved */
+}
 
-	a = getarena(bp.addr);
-	lock(a);
-	cachedel(bp.addr);
-	blkdealloc_lk(bp.addr);
-	unlock(a);
+void
+epochend(int tid)	/* worker tid finished handling a request */
+{
+	ainc((long*)&fs->active[tid]);	/* second bump of the pair started by epochstart(), so parity flips back */
 }
 
 void
-quiesce(int tid)
+epochclean(void)
 {
 	int i, allquiesced;
 	Bfree *p, *n;
+	Arena *a;
 
 	lock(&fs->activelk);
 	allquiesced = 1;
-	fs->active[tid]++;
 	for(i = 0; i < fs->nquiesce; i++){
 		/*
 		 * Odd parity on quiescence implies
@@ -921,30 +889,37 @@
 		 * that enters us into the critical
 		 * section.
 		 */
-		if((fs->active[i] & 1) == 0)
+		if((fs->active[i] & 1) != 0)
 			continue;
 		if(fs->active[i] == fs->lastactive[i])
 			allquiesced = 0;
 	}
+
 	p = nil;
 	if(allquiesced){
-		inc64(&fs->qgen, 1);
 		for(i = 0; i < fs->nquiesce; i++)
 			fs->lastactive[i] = fs->active[i];
 
-		lock(&fs->dealloclk);
-		if(fs->deallocp != nil){
-			p = fs->deallocp->next;
-			fs->deallocp->next = nil;
+		lock(&fs->freelk);
+		fs->qgen++;
+		if(fs->freep != nil){
+			p = fs->freep->next;
+			fs->freep->next = nil;
 		}
-		fs->deallocp = fs->deallochd;
-		unlock(&fs->dealloclk);
+		fs->freep = fs->freehd;
+		unlock(&fs->freelk);
 	}
 	unlock(&fs->activelk);
 
 	while(p != nil){
 		n = p->next;
-		reclaimblk(p->bp);
+		a = getarena(p->bp.addr);
+
+		lock(a);
+		cachedel(p->bp.addr);
+		blkdealloc_lk(p->bp.addr);
+		unlock(a);
+
 		free(p);
 		p = n;
 	}
@@ -967,7 +942,7 @@
 
 	a = getarena(b->bp.addr);
 	assert(checkflag(b, Bdirty));
-	refblk(b);
+	holdblk(b);
 	finalize(b);
 	chsend(a->sync, b);
 }
@@ -1030,7 +1005,7 @@
 
 	c = p;
 	q.nheap = 0;
-	q.heapsz = fs->cmax;
+	q.heapsz = 2*fs->cmax/fs->narena;
 	if((q.heap = malloc(q.heapsz*sizeof(Blk*))) == nil)
 		sysfatal("alloc queue: %r");
 	while(1){
@@ -1056,9 +1031,8 @@
 				fprint(2, "write: %r");
 				abort();
 			}
-			lrubump(b);
 		}
-		putblk(b);
+		dropblk(b);
 	}
 }
 
--- a/cache.c
+++ b/cache.c
@@ -7,117 +7,120 @@
 #include "fns.h"
 
 static void
-cachedel_lk(vlong del)
+lrudel(Blk *b)	/* unlink b from the LRU list; callers in this file hold fs->lrulk. A block that is not linked is left untouched */
 {
-	Bucket *bkt;
-	Blk *b, **p;
-	u32int h;
-
-	h = ihash(del);
-	bkt = &fs->cache[h % fs->cmax];
-	lock(bkt);
-	p = &bkt->b;
-	for(b = bkt->b; b != nil; b = b->hnext){
-		if(b->bp.addr == del)
-			break;
-		p = &b->hnext;
-	}
-	unlock(bkt);
-	if(b == nil)
-		return;
-	assert(checkflag(b, Bcached));
-
-	*p = b->hnext;
+	if(b == fs->chead)	/* fix up the list ends before the neighbours */
+		fs->chead = b->cnext;
+	if(b == fs->ctail)
+		fs->ctail = b->cprev;
 	if(b->cnext != nil)
 		b->cnext->cprev = b->cprev;
 	if(b->cprev != nil)
 		b->cprev->cnext = b->cnext;
-	if(fs->ctail == b)
-		fs->ctail = b->cprev;
-	if(fs->chead == b)
-		fs->chead = b->cnext;
 	b->cnext = nil;
-	b->cprev = nil;
-	fs->ccount--;
+	b->cprev = nil;		/* b now carries no LRU links */
+}
 
-	clrflag(b, Bcached);
-	putblk(b);
+void
+lrutop(Blk *b)	/* put an unreferenced block at the head of the LRU */
+{
+	qlock(&fs->lrulk);
+	/*
+	 * If the block has a reference again, someone
+	 * got in first and did a cache lookup; we no
+	 * longer want to put this into the LRU,
+	 * because it's now in use.
+	 */
+	assert(b->magic == Magic);
+	if(b->ref != 0){
+		qunlock(&fs->lrulk);
+		return;
+	}
+	lrudel(b);	/* unlink from any current position before relinking */
+	if(fs->chead != nil)
+		fs->chead->cprev = b;
+	if(fs->ctail == nil)
+		fs->ctail = b;
+	b->cnext = fs->chead;
+	fs->chead = b;
+	rwakeup(&fs->lrurz);	/* cachepluck() sleeps on lrurz while the list is empty */
+	qunlock(&fs->lrulk);
 }
 
 void
-cachedel(vlong del)
+lrubot(Blk *b)	/* put an unreferenced block at the tail of the LRU, where cachepluck() takes from */
 {
-	lock(&fs->lrulk);
-	cachedel_lk(del);
-	unlock(&fs->lrulk);
+	qlock(&fs->lrulk);
+	/*
+	 * If the block has a reference again, someone
+	 * got in first and did a cache lookup; we no
+	 * longer want to put this into the LRU,
+	 * because it's now in use.
+	 */
+	assert(b->magic == Magic);
+	if(b->ref != 0){
+		qunlock(&fs->lrulk);
+		return;
+	}
+	lrudel(b);	/* unlink from any current position before relinking */
+	if(fs->ctail != nil)
+		fs->ctail->cnext = b;
+	if(fs->chead == nil)
+		fs->chead = b;
+	b->cprev = fs->ctail;
+	fs->ctail = b;
+	rwakeup(&fs->lrurz);	/* cachepluck() sleeps on lrurz while the list is empty */
+	qunlock(&fs->lrulk);
 }
 
-Blk*
-cacheblk(Blk *b)
+void
+cacheins(Blk *b)	/* hash b under its bp.addr so cacheget() can find it; takes no reference on b */
 {
 	Bucket *bkt;
 	u32int h;
-	Blk *e;
 
+	assert(b->magic == Magic);	/* sanity check that b is an initialized pool buffer */
 	h = ihash(b->bp.addr);
 	bkt = &fs->cache[h % fs->cmax];
 	lock(bkt);
-	for(e = bkt->b; e != nil; e = e->hnext){
-		if(b == e)
-			goto Found;
-		assert(b->bp.addr != e->bp.addr);
+	if(checkflag(b, Bcached)){	/* already in a bucket: inserting twice would corrupt the chain */
+		unlock(bkt);
+		return;
 	}
+	setflag(b, Bcached);	/* flag is set under the bucket lock, just before linking */
 	b->hnext = bkt->b;
 	bkt->b = b;
-	if(!checkflag(b, Bcached)){
-		setflag(b, Bcached);
-		refblk(b);
-		fs->ccount++;
-	}
-Found:
 	unlock(bkt);
-	return b;
 }
 
-Blk*
-lrubump(Blk *b)
+void
+cachedel(vlong addr)	/* remove the block hashed under addr from its bucket, if any; LRU links and refs are not touched here */
 {
-	Blk *c;
+	Bucket *bkt;
+	Blk *b, **p;
+	u32int h;
 
-	lock(&fs->lrulk);
-	if(checkflag(b, Bcached) == 0){
-		assert(b->cnext == nil);
-		assert(b->cprev == nil);
-		goto Done;
-	}
-	if(b == fs->chead)
-		fs->chead = b->cnext;
+	if(addr == -1)	/* -1 is the address given to buffers with no disk block (see initfs, cachepluck) */
+		return;
 
-	if(b == fs->ctail)
-		fs->ctail = b->cprev;
-	if(b->cnext != nil)
-		b->cnext->cprev = b->cprev;
-	if(b->cprev != nil)
-		b->cprev->cnext = b->cnext;
-	if(fs->ctail == nil)
-		fs->ctail = b;
-	if(fs->chead != nil)
-		fs->chead->cprev = b;
-	b->cnext = fs->chead;
-	b->cprev = nil;
-	fs->chead = b;
-	for(c = fs->ctail; c != b && fs->ccount >= fs->cmax; c = fs->ctail){
-		assert(c != fs->chead);
-		cachedel_lk(c->bp.addr);
+	h = ihash(addr);
+	bkt = &fs->cache[h % fs->cmax];
+	lock(bkt);
+	p = &bkt->b;	/* p trails the walk: it is the link that points at the current entry */
+	for(b = bkt->b; b != nil; b = b->hnext){
+		if(b->bp.addr == addr){
+			*p = b->hnext;
+			clrflag(b, Bcached);
+			b->hnext = nil;
+			break;
+		}
+		p = &b->hnext;
 	}
-
-Done:
-	unlock(&fs->lrulk);
-	return b;
+	unlock(bkt);
 }
 
 Blk*
-lookupblk(vlong off)
+cacheget(vlong off)
 {
 	Bucket *bkt;
 	u32int h;
@@ -127,14 +130,49 @@
 
 	inc64(&fs->stats.cachelook, 1);
 	bkt = &fs->cache[h % fs->cmax];
+
+	qlock(&fs->lrulk);
 	lock(bkt);
-	for(b = bkt->b; b != nil; b = b->hnext)
+	for(b = bkt->b; b != nil; b = b->hnext){
 		if(b->bp.addr == off){
 			inc64(&fs->stats.cachehit, 1);
- 			refblk(b);
+ 			holdblk(b);
+			lrudel(b);
+			b->lasthold = getcallerpc(&off);
 			break;
 		}
+	}
 	unlock(bkt);
+	qunlock(&fs->lrulk);
+
 	return b;
 }
 
+/*
+ * Pulls the block from the bottom of the LRU for reuse, sleeping until one is available; it is returned held, unhashed, and with its per-use state reset.
+ */
+Blk*
+cachepluck(void)
+{
+	Blk *b;
+
+	qlock(&fs->lrulk);
+	while(fs->ctail == nil)	/* LRU empty: wait for lrutop()/lrubot() to rwakeup lrurz */
+		rsleep(&fs->lrurz);
+
+	b = fs->ctail;
+	assert(b->magic == Magic);
+	assert(b->ref == 0);	/* only unreferenced blocks are ever linked into the LRU */
+	cachedel(b->bp.addr);	/* unhash the previous contents; a no-op when addr is -1 */
+	lrudel(b);
+	b->flag = 0;
+	b->bp.addr = -1;
+	b->bp.hash = -1;
+	b->lasthold = 0;
+	b->lastdrop = 0;
+	b->freed = 0;
+	b->hnext = nil;
+	qunlock(&fs->lrulk);
+
+	return  holdblk(b);	/* the caller owns the single reference; the hold is taken after lrulk is released */
+}
--- a/check.c
+++ b/check.c
@@ -81,7 +81,7 @@
 			}
 			if(badblk(fd, c, h - 1, &x, &y))
 				fail++;
-			putblk(c);
+			dropblk(c);
 		}
 		r = keycmp(&x, &y);
 		switch(r){
@@ -187,7 +187,7 @@
 	if((b = getroot(&fs->snap, &height)) != nil){
 		if(badblk(fd, b, height-1, nil, 0))
 			ok = 0;
-		putblk(b);
+		dropblk(b);
 	}
 	return ok;
 }
--- a/cons.c
+++ b/cons.c
@@ -226,14 +226,23 @@
 }
 
 static void
-showblkdump(int fd, char **ap, int)
+showblkdump(int fd, char **ap, int na)	/* console "show blk": na is presumably the argument count -- confirm against the command dispatcher */
 {
 	Bptr bp;
+	Blk *b;
 
-	bp.addr = strtoll(ap[0], nil, 16);
-	bp.hash = -1;
-	bp.gen = -1;
-	showbp(fd, bp, 0);
+	if(na == 0){	/* no address given: list every buffer in the pool */
+		for(b = fs->blks; b != fs->blks+fs->cmax; b++){
+			fprint(fd, "%#p %B:\t%#lx %#llx %#llx\n", b, b->bp, b->flag, b->alloced, b->freed);
+			b->magic = Magic;	/* NOTE(review): a dump command rewrites magic and, below, relinks idle buffers -- looks like a debugging aid; confirm it is intended */
+			lrutop(b);	/* no effect on held blocks; unreferenced ones move to the LRU head */
+		}
+	}else{
+		bp.addr = strtoll(ap[0], nil, 16);	/* address is parsed as hex */
+		bp.hash = -1;
+		bp.gen = -1;
+		showbp(fd, bp, 0);
+	}
 }
 
 static void
@@ -289,7 +298,8 @@
 	{.name="show",	.sub="snap",	.minarg=0, .maxarg=1, .fn=showsnap},
 	{.name="show",	.sub="tree",	.minarg=0, .maxarg=1, .fn=showtree},
 	{.name="show",	.sub="users",	.minarg=0, .maxarg=0, .fn=showusers},
-	{.name="show",	.sub="blk",	.minarg=1, .maxarg=1, .fn=showblkdump},
+	{.name="show",	.sub="blk",	.minarg=0, .maxarg=1, .fn=showblkdump},
+	{.name="show",	.sub="blks",	.minarg=1, .maxarg=1, .fn=showblkdump},
 	{.name="debug",	.sub=nil,	.minarg=0, .maxarg=1, .fn=setdbg},
 
 	{.name=nil, .sub=nil},
@@ -306,7 +316,7 @@
 	while(1){
 		if((n = read(fd, buf, sizeof(buf)-1)) == -1)
 			break;
-		quiesce(tid);
+		epochstart(tid);
 		buf[n] = 0;
 		nf = tokenize(buf, f, nelem(f));
 		if(nf == 0 || strlen(f[0]) == 0)
@@ -334,6 +344,6 @@
 				fprint(fd, " %s", f[i]);
 			fprint(fd, "'\n");
 		}
-		quiesce(tid);
+		epochend(tid);
 	}
 }
--- a/dat.h
+++ b/dat.h
@@ -92,10 +92,9 @@
 
 enum {
 	Bdirty	= 1 << 0,
-	Bqueued	= 1 << 1,
-	Bfinal	= 1 << 2,
-	Bfreed	= 1 << 3,
-	Bcached	= 1 << 4,
+	Bfinal	= 1 << 1,
+	Bfreed	= 1 << 2,
+	Bcached	= 1 << 3,
 };
 
 /* internal errors */
@@ -240,8 +239,7 @@
 };
 
 enum {
-	HdMagic = 0x68646d6167696373,
-	TlMagic = 0x979b929e98969c8c,
+	Magic = 0x979b929e98969c8c,
 };
 
 /*
@@ -375,8 +373,9 @@
 };
 
 struct Bfree {
-	Bptr	bp;
 	Bfree	*next;
+	Blk	*b;
+	Bptr	bp;
 };
 
 struct User {
@@ -430,18 +429,14 @@
 	int	nquiesce;
 	vlong	qgen;
 	Lock	activelk;
-	int	active[32];
+	ulong	active[32];
 	int	lastactive[32];
 	Chan	*chsync[32];
 
-	QLock	freelk;
-	Rendez	freerz;
-	Blk	*free;
+	Lock	freelk;
+	Bfree	*freep;
+	Bfree	*freehd;
 
-	Lock	dealloclk;
-	Bfree	*deallocp;
-	Bfree	*deallochd;
-
 	int	fd;
 	long	broken;
 	long	rdonly;
@@ -460,12 +455,14 @@
 	QLock	blklk[32];
 
 	/* protected by lrulk */
-	Lock	lrulk;
+	QLock	lrulk;
+	Rendez	lrurz;
 	Bucket	*cache;
+	Blk	*blks;	/* all blocks for debugging */
 	Blk	*chead;
 	Blk	*ctail;
-	int	ccount;
-	int	cmax;
+	usize	ccount;
+	usize	cmax;
 
 	Stats	stats;
 };
@@ -604,13 +601,20 @@
 	vlong	logsz;	/* for allocation log */
 	vlong	lognxt;	/* for allocation log */
 
+	/* debug */
 	uintptr	alloced;
-	uintptr	freed;	/* debug */
+	uintptr lasthold;
+	uintptr lasthold0;
+	uintptr lasthold1;
+	uintptr lastdrop;
+	uintptr uncached;
+	uintptr	freed;
 
 	Bptr	bp;
 	long	ref;
 	char	*data;
 	char	buf[Blksz];
+	vlong	magic;
 };
 
 struct Chan {
--- a/dump.c
+++ b/dump.c
@@ -266,7 +266,7 @@
 					sysfatal("failed load: %r");
 				if(recurse)
 					rshowblk(fd, c, indent + 1, 1);
-				putblk(c);
+				dropblk(c);
 			}else{
 				fprint(fd, "%.*s[%03d]|%P\n", 4*indent, spc, i, &kv);
 			}
@@ -322,7 +322,7 @@
 	b = getroot(t, &h);
 	fprint(fd, "=== [%s] %B @%d\n", name, t->bp, t->ht);
 	rshowblk(fd, b, 0, 1);
-	putblk(b);
+	dropblk(b);
 	if(t != &fs->snap)
 		closesnap(t);
 }
@@ -334,7 +334,7 @@
 
 	b = getblk(bp, GBnochk);
 	rshowblk(fd, b, 0, recurse);
-	putblk(b);
+	dropblk(b);
 }
 
 static void
--- a/fns.h
+++ b/fns.h
@@ -15,16 +15,23 @@
 Blk*	dupblk(Blk*);
 Blk*	getroot(Tree*, int*);
 Blk*	getblk(Bptr, int);
-Blk*	refblk(Blk*);
-Blk*	cacheblk(Blk*);
-Blk*	lrubump(Blk*);
+Blk*	holdblk(Blk*);
+void	dropblk(Blk*);
+
+void	lrutop(Blk*);
+void	lrubot(Blk*);
+void	cacheins(Blk*);
 void	cachedel(vlong);
-Blk*	lookupblk(vlong);
+Blk*	cacheget(vlong);
+Blk*	cachepluck(void);
+
 Arena*	getarena(vlong);
-void	putblk(Blk*);
 int	syncblk(Blk*);
 void	enqueue(Blk*);
-void	quiesce(int);
+void	epochstart(int);
+void	epochend(int);
+void	epochclean(void);
+void	freesync(void);
 void	freeblk(Tree*, Blk*);
 void	freebp(Tree*, Bptr);
 int	killblk(Tree*, Bptr);
--- a/fs.c
+++ b/fs.c
@@ -276,7 +276,7 @@
 	if((b = getblk(bp, GBraw)) == nil)
 		return -1;
 	memcpy(d, b->buf+fo, n);
-	putblk(b);
+	dropblk(b);
 	return n;
 }
 
@@ -309,7 +309,7 @@
 				return -1;
 			memcpy(b->buf, t->buf, Blksz);
 			freeblk(f->mnt->root, t);
-			putblk(t);
+			dropblk(t);
 		}else if(e != Eexist){
 			werrstr("%s", e);
 			return -1;
@@ -328,7 +328,7 @@
 
 	packbp(m->v, m->nv, &b->bp);
 	*ret = b->bp;
-	putblk(b);
+	dropblk(b);
 	return n;
 }
 
@@ -1886,7 +1886,7 @@
 
 	while(1){
 		m = chrecv(fs->wrchan, 1);
-		quiesce(wid);
+		epochstart(wid);
 		ao = (m->a == nil) ? AOnone : m->a->op;
 		switch(ao){
 		case AOnone:
@@ -1923,7 +1923,7 @@
 			freemsg(m);
 			break;
 		}
-		quiesce(wid);
+		epochend(wid);
 	}
 }
 
@@ -1934,7 +1934,7 @@
 
 	while(1){
 		m = chrecv(fs->rdchan, 1);
-		quiesce(wid);
+		epochstart(wid);
 		switch(m->type){
 		case Tflush:	rerror(m, Eimpl);	break;
 		case Tattach:	fsattach(m);	break;
@@ -1943,7 +1943,7 @@
 		case Tstat:	fsstat(m);	break;
 		case Topen:	fsopen(m);	break;
 		}
-		quiesce(wid);
+		epochend(wid);
 	}
 }
 
--- a/load.c
+++ b/load.c
@@ -50,11 +50,11 @@
 	Arena *a;
 	char *e;
 	Tree *t;
-	int i;
+	int i, k;
 
 	fs->osnap = nil;
 	fs->gotinfo = 0;
-	fs->narena = 8;
+	fs->narena = 1;
 	if((fs->fd = open(dev, ORDWR)) == -1)
 		sysfatal("open %s: %r", dev);
 	if((fs->arenas = calloc(1, sizeof(Arena))) == nil)
@@ -67,7 +67,8 @@
 		if(!fs->gotinfo){
 			if((fs->arenas = realloc(fs->arenas, fs->narena*sizeof(Arena))) == nil)
 				sysfatal("malloc: %r");
-			memset(fs->arenas+1, 0, (fs->narena-1)*sizeof(Arena));
+			for(k = 1; k < fs->narena; k++)
+				memset(&fs->arenas[k], 0, sizeof(Arena));
 			fs->gotinfo = 1;
 		}
 	}
@@ -92,6 +93,7 @@
 	fprint(2, "\tarenasz:\t%lld\n", fs->arenasz);
 	fprint(2, "\tnextqid:\t%lld\n", fs->nextqid);
 	fprint(2, "\tnextgen:\t%lld\n", fs->nextgen);
+	fprint(2, "\tblocksize:\t%lld\n", Blksz);
 	fprint(2, "\tcachesz:\t%lld MiB\n", fs->cmax*Blksz/MiB);
 	if((t = openlabel("main")) == nil)
 		sysfatal("load users: no main label");
--- a/main.c
+++ b/main.c
@@ -15,7 +15,7 @@
 char	*forceuser;
 char	*srvname = "gefs";
 char	*dev;
-int	cachesz = 512*MiB;
+vlong	cachesz = 512*MiB;
 
 vlong
 inc64(vlong *v, vlong dv)
@@ -33,15 +33,12 @@
 static void
 initfs(vlong cachesz)
 {
-	char *p, *buf, *ebuf;
-	usize sz;
-	uvlong *ck;
-	Blk *b;
+	Blk *b, *buf;
 
 	if((fs = mallocz(sizeof(Gefs), 1)) == nil)
 		sysfatal("malloc: %r");
 
-	fs->freerz.l = &fs->freelk;
+	fs->lrurz.l = &fs->lrulk;
 	fs->syncrz.l = &fs->synclk;
 	fs->noauth = noauth;
 	fs->cmax = cachesz/Blksz;
@@ -51,22 +48,16 @@
 		sysfatal("malloc: %r");
 
 	/* leave room for corruption check magic */
-	sz = 8+sizeof(Blk)+8;
-	buf = sbrk(fs->cmax * sz);
+	buf = sbrk(fs->cmax * sizeof(Blk));
 	if(buf == (void*)-1)
 		sysfatal("sbrk: %r");
-	ebuf = buf + fs->cmax*sz;
-	for(p = buf; p != ebuf; p += sz){
-		ck = (uvlong*)p;
-		*ck = HdMagic;
-
-		b = (Blk*)(p+8);
-		b->fnext = fs->free;
-		fs->free = b;
-
-		ck = (uvlong*)(b+1);
-		*ck = TlMagic;
+	for(b = buf; b != buf+fs->cmax; b++){
+		b->bp.addr = -1;
+		b->bp.hash = -1;
+		b->magic = Magic;
+		lrutop(b);
 	}
+	fs->blks = buf;
 }
 
 static void
@@ -197,10 +188,15 @@
 
 	if((s = getenv("NPROC")) != nil)
 		nproc = atoi(s);
-	if(nproc == 0)
+
+	/*
+	 * too few procs, we can't parallelize io,
+	 * too many, we suffer lock contention
+	 */
+	if(nproc < 2)
 		nproc = 2;
-	if(nproc > nelem(fs->active))
-		nproc = nelem(fs->active);
+	if(nproc > 6)
+		nproc = 6;
 	if(ream){
 		reamfs(dev);
 		exits(nil);
@@ -210,19 +206,19 @@
 
 	fs->rdchan = mkchan(32);
 	fs->wrchan = mkchan(32);
-	fs->nsyncers = nproc;
+	fs->nsyncers = 2;
 	if(fs->nsyncers > fs->narena)
 		fs->nsyncers = fs->narena;
 	for(i = 0; i < fs->nsyncers; i++)
-		fs->chsync[i] = mkchan(128);
+		fs->chsync[i] = mkchan(1024);
 	for(i = 0; i < fs->narena; i++)
-		fs->arenas[i].sync = fs->chsync[i%nproc];
+		fs->arenas[i].sync = fs->chsync[i%fs->nsyncers];
 	srvfd = postfd(srvname, "");
 	ctlfd = postfd(srvname, ".cmd");
 	launch(runtasks, -1, nil, "tasks");
 	launch(runcons, fs->nquiesce++, (void*)ctlfd, "ctl");
 	launch(runwrite, fs->nquiesce++, nil, "mutate");
-	for(i = 0; i < nproc; i++)
+	for(i = 0; i < 2; i++)
 		launch(runread, fs->nquiesce++, nil, "readio");
 	for(i = 0; i < fs->nsyncers; i++)
 		launch(runsync, -1, fs->chsync[i], "syncio");
--- a/ream.c
+++ b/ream.c
@@ -85,8 +85,7 @@
 	char *p;
 	Blk *b;
 
-	if((b = mallocz(sizeof(Blk), 1)) == nil)
-		sysfatal("ream: %r");
+	b = cachepluck();
 	addr = start+Blksz;	/* arena header */
 
 	a->head.addr = -1;
@@ -93,7 +92,7 @@
 	a->head.hash = -1;
 	a->head.gen = -1;
 
-	memset(b, 0, sizeof(Blk));
+	memset(b->buf, 0, sizeof(b->buf));
 	b->type = Tlog;
 	b->bp.addr = addr;
 	b->logsz = 32;
@@ -109,11 +108,13 @@
 	finalize(b);
 	if(syncblk(b) == -1)
 		sysfatal("ream: init log");
+	dropblk(b);
 
 	bh = b->bp.hash;
 	bo = b->bp.addr;
 
-	memset(b, 0, sizeof(Blk));
+	b = cachepluck();
+	memset(b->buf, 0, sizeof(b->buf));
 	b->type = Tarena;
 	b->bp.addr = start;
 	b->data = b->buf;
@@ -127,6 +128,7 @@
 	finalize(b);
 	if(syncblk(b) == -1)
 		sysfatal("ream: write arena: %r");
+	dropblk(b);
 }
 
 void
@@ -189,7 +191,7 @@
 	}
 	if((tb = newblk(Tleaf)) == nil)
 		sysfatal("ream: allocate root: %r");
-	refblk(tb);
+	holdblk(tb);
 	initroot(tb);
 	finalize(tb);
 	syncblk(tb);
@@ -204,7 +206,7 @@
 	 */
 	if((rb = newblk(Tleaf)) == nil)
 		sysfatal("ream: allocate snaps: %r");
-	refblk(rb);
+	holdblk(rb);
 	initsnap(rb, tb);
 	finalize(rb);
 	syncblk(rb);
@@ -212,8 +214,8 @@
 	fs->snap.bp = rb->bp;
 	fs->snap.ht = 1;
 
-	putblk(tb);
-	putblk(rb);
+	dropblk(tb);
+	dropblk(rb);
 	for(i = 0; i < fs->narena; i++){
 		a = &fs->arenas[i];
 		finalize(a->tail);
--- a/snap.c
+++ b/snap.c
@@ -98,7 +98,7 @@
 		}
 		lb->logsz = Loghashsz;
 		dl->ins = lb;
-		putblk(pb);
+		dropblk(pb);
 	}
 	p = lb->data + lb->logsz;
 	PBIT64(p, v1);	p += 8;
--- a/tree.c
+++ b/tree.c
@@ -963,7 +963,7 @@
 	if(pp->op != POmod || pp->op != POmerge)
 		return 0;
 
-	m = refblk(pp->nl);
+	m = holdblk(pp->nl);
 	spc = (m->type == Tleaf) ? Leafspc : Pivspc;
 	if(idx-1 >= 0){
 		getval(p->b, idx-1, &kl);
@@ -990,9 +990,9 @@
 Done:
 	ret = 0;
 Out:
-	putblk(m);
-	putblk(l);
-	putblk(r);
+	dropblk(m);
+	dropblk(l);
+	dropblk(r);
 	return ret;
 }
 
@@ -1081,9 +1081,9 @@
 			freeblk(t, p->b);
 		if(p->m != nil)
 			freeblk(t, p->m);
-		putblk(p->b);
-		putblk(p->nl);
-		putblk(p->nr);
+		dropblk(p->b);
+		dropblk(p->nl);
+		dropblk(p->nr);
 	}
 	free(path);
 }
@@ -1178,6 +1178,7 @@
 		PBIT16(p, o);
 	}
 	enqueue(r);
+
 	lock(&t->lk);
 	t->bp = r->bp;
 	t->dirty = 1;
@@ -1184,8 +1185,8 @@
 	unlock(&t->lk);
 
 	freeblk(t, b);
-	putblk(b);
-	putblk(r);
+	dropblk(b);
+	dropblk(r);
 	return nil;
 }
 	
@@ -1265,6 +1266,8 @@
 
 
 	assert(rb->bp.addr != 0);
+	assert(rb->bp.addr != 0);
+
 	lock(&t->lk);
 	t->ht += dh;
 	t->bp = rb->bp;
@@ -1305,11 +1308,13 @@
 
 	if((b = getroot(t, &h)) == nil)
 		return Efs;
-	if((p = calloc(h, sizeof(Blk*))) == nil)
+	if((p = calloc(h, sizeof(Blk*))) == nil){
+		dropblk(b);
 		return Enomem;
+	}
 	err = Eexist;
 	ok = 0;
-	p[0] = refblk(b);
+	p[0] = holdblk(b);
 	for(i = 1; i < h; i++){
 		if(blksearch(p[i-1], k, r, &same) == -1)
 			break;
@@ -1351,8 +1356,8 @@
 Out:
 	for(i = 0; i < h; i++)
 		if(p[i] != nil)
-			putblk(p[i]);
-	putblk(b);
+			dropblk(p[i]);
+	dropblk(b);
 	free(p);
 	return err;
 }
@@ -1432,7 +1437,7 @@
 			return nil;
 		}
 		if(p[i].b != nil)
-			putblk(p[i].b);
+			dropblk(p[i].b);
 		p[i].b = nil;
 		p[i].vi = 0;
 		p[i].bi = 0;
@@ -1498,6 +1503,6 @@
 	int i;
 
 	for(i = 0; i < s->root.ht; i++)
-		putblk(s->path[i].b);
+		dropblk(s->path[i].b);
 	free(s->path);
 }