shithub: gefs

ref: 6b02773727cc3b9c434ef1ce5bbd94dd6b706aef
dir: /blk.c/

View raw version
#include <u.h>
#include <libc.h>
#include <fcall.h>
#include <avl.h>

#include "dat.h"
#include "fns.h"

static vlong	blkalloc_lk(Arena*);
static Blk	*cacheblk(Blk*);
static Blk	*lookupblk(vlong);

Blk*
readblk(vlong bp, int flg)
{
	Blk *b;
	vlong off, rem, n;

	assert(bp != -1);
	if((b = malloc(sizeof(Blk))) == nil)
		return nil;
	off = bp;
	rem = Blksz;
	while(rem != 0){
		n = pread(fs->fd, b->buf, rem, off);
		if(n <= 0){
			free(b);
			return nil;
		}
		off += n;
		rem -= n;
	}
	memset(&b->RWLock, 0, sizeof(RWLock));
	b->type = (flg&GBraw) ? Traw : GBIT16(b->buf+0);
	b->off = bp;
	b->cnext = nil;
	b->cprev = nil;
	b->hnext = nil;
	b->data = b->buf + 10;
	switch(b->type){
	default:
		if(flg&GBraw)
			break;
		fprint(2, "invalid block @%llx\n", bp);
		abort();
		break;
	case Tarena:
	case Traw:
	case Tsuper:
	case Tlog:
		break;
	case Tpivot:
		b->nval = GBIT16(b->buf+2);
		b->valsz = GBIT16(b->buf+4);
		b->nbuf = GBIT16(b->buf+6);
		b->bufsz = GBIT16(b->buf+8);
		break;
	case Tleaf:
		b->nval = GBIT16(b->buf+2);
		b->valsz = GBIT16(b->buf+4);
		break;
	}
	return b;
}

static Arena*
pickarena(vlong hint)
{
	long n;

	n = -1; /* shut up, ken */
	if(hint > 0 || hint < fs->narena)
		n = hint / fs->arenasz;
	else if(hint == -1)
		n = ainc(&fs->nextarena) % fs->narena;
	else
		abort();
	return &fs->arenas[n];
}

static Arena*
getarena(vlong b)
{
	int i;

	i = b / fs->arenasz;
	if(i < 0 || i >= fs->narena){
		werrstr("out of range block %lld", b);
		abort();
//		return nil;
	}
	return &fs->arenas[i];
}

static int
freerange(Avltree *t, vlong off, vlong len)
{
	Arange *r, *s, q;

	assert(len % Blksz == 0);

	q.off = off;
	q.len = len;
	r = (Arange*)avllookup(t, &q.Avl, -1);
	if(r == nil){
		if((r = calloc(1, sizeof(Arange))) == nil)
			return -1;
		r->off = off;
		r->len = len;
		avlinsert(t, r);
	}else if(off+len == r->off){
		r->len += len;
		r->off -= len;
	}else if(off == r->off + r->len){
		r->len += len;
	}else{
		r = (Arange*)avlnext(r);
		if(r != nil && off+len == r->off){
			r->off -= len;
			r->len += len;
		} else {
			if((r = calloc(1, sizeof(Arange))) == nil)
				return -1;
			r->off = off;
			r->len = len;
			avlinsert(t, r);
		}
	}

	/* merge with previous */
	s = (Arange*)avlprev(r);
	if(s != nil && s->off + s->len == r->off){
		s->off += r->len;
		avldelete(t, r);
	}
	/* merge with next */
	s = (Arange*)avlnext(r);
	if(s != nil && r->off + r->len == s->off){
		r->off += r->len;
		avldelete(t, s);
	}
	return 0;
}

int
grabrange(Avltree *t, vlong off, vlong len)
{
	Arange *r, *s, q;

	assert(len % Blksz == 0);
	q.off = off;
	q.len = len;
	r = (Arange*)avllookup(t, &q.Avl, -1);
	if(r == nil || off + len > r->off + r->len){
		fprint(2, "no blocks: %llx+%llx in:\n", off, len);
		for(r = (Arange*)avlmin(t); r != nil; r = (Arange*)avlnext(r))
			fprint(2, "\t%llx+%llx\n", r->off, r->len);
		abort();
		return -1;
	}
	if(off == r->off)
		r->off += len;
	else if(off == r->off + r->len)
		r->len -= len;
	else if(off + len > r->off && off > r->off + r->len){
		if((s = malloc(sizeof(Arange))) == nil)
			return -1;
		s->off = off;
		s->len = r->len - (off - r->off) - len;;
		r->len = off - r->off;
		avlinsert(t, s);
	}
	if(r->len == 0){
		avldelete(t, r);
		free(r);
	}
	return 0;
}

int
synclog(Blk *b, vlong link, vlong sz)
{

	if(sz == -1)
		sz = b->logsz;
	PBIT64(b->data+8, link);
	PBIT64(b->data+16, sz);
	finalize(b);
fprint(2, "synclog: @%llx\n", b->off);
showfree("synclog");
	return pwrite(fs->fd, b->buf, Blksz, b->off);
}

Blk*
logappend(Arena *a, Blk *lb, vlong off, vlong len, int op, vlong graft)
{
	Blk *pb;
	vlong o, n;
	char *p;

	assert(off % Blksz == 0);
	if(lb == nil || lb->logsz+16+24 > Blkspc){
		pb = lb;
		if((o = blkalloc_lk(a)) == -1)
			return nil;
		if((lb = mallocz(sizeof(Blk), 1)) == nil)
			return nil;
		lb->data = lb->buf + Hdrsz;
		lb->flag |= Bdirty;
		lb->type = Tlog;
		lb->off = o;
		lb->logsz = 0;
		if(synclog(lb, graft, 0) == -1)
			return nil;

		a->logtl = lb;
		if(pb != nil)
			if(synclog(pb, lb->off, -1) == -1)
				return nil;
	}
	n = 8;
	p = lb->data + 24 + lb->logsz;
	if(len == Blksz && op == LgAlloc)
		op = LgAlloc1;
	off |= op;
	PBIT64(p, off);
	if(op == LgAlloc || op == LgFree){
		PBIT64(p+8, len);
		n += 8;
	}
	lb->logsz += n;
	return lb;
}

/*
 * Logs an allocation. Must be called
 * with arena lock held. Duplicates some/c
 * of the work in allocblk to prevent
 * recursion.
 */
int
logalloc(Arena *a, vlong off, int op)
{
	Blk *b;

	if((b = logappend(a, a->logtl, off, Blksz, op, -1)) == nil)
		return -1;
	if(a->logtl == nil){
		a->log = b->off;
		a->logtl = b;
	}
	return 0;
}

int
loadlog(Arena *a)
{
	Blk *b;
	vlong bp, ent, off, len;
	uvlong bh;
	char *p, *d;
	int op, i, n;

	bp = a->log;
	while(bp != -1){
		dprint("log block: %llx\n", bp);
		if((b = readblk(bp, 0)) == nil)
			return -1;
		p = b->data;
		bh = GBIT64(p + 0);
		bp = GBIT64(p + 8);
		b->logsz = GBIT64(p + 16);
		/* the hash covers the log and offset */
		if(bh != siphash(p+8, Blkspc-8)){
			werrstr("corrupt log");
			return -1;
		}
		for(i = 0; i < b->logsz; i += n){
			d = p + i + 24;
			ent = GBIT64(d);
			op = ent & 0xff;
			off = ent & ~0xff;
			switch(op){
			case LgFlush:
				dprint("log@%d: flush: %llx\n", i, off>>8);
				n = 8;
				lock(&fs->genlk);
				fs->gen = off >> 8;
				unlock(&fs->genlk);
				break;
			case LgAlloc1:
				n = 8;
				dprint("log@%d alloc1: %llx\n", i, off);
				if(grabrange(a->free, off & ~0xff, Blksz) == -1)
					return -1;
				break;
			case LgAlloc:
				n = 16;
				len = GBIT64(d+8);
				dprint("log@%d alloc: %llx+%llx\n", i, off, len);
				if(grabrange(a->free, off & ~0xff, len) == -1)
					return -1;
				break;
			case LgFree:
				n = 16;
				len = GBIT64(d+8);
				dprint("log@%d free: %llx+%llx\n", i, off, len);
				if(freerange(a->free, off & ~0xff, len) == -1)
					return -1;
				break;
			case LgRef1:
			case LgUnref1:
				n = 8;
				fprint(2, "unimplemented ref op at log@%d: log op %d\n", i, op);
				break;
			default:
				n = 0;
				dprint("log@%d: log op %d\n", i, op);
				abort();
				break;
			}
		}
	}
	return 0;
}

int
compresslog(Arena *a)
{
	Arange *r;
	vlong *log, *p, bp, hd, hh, graft;
	int i, n, sz;
	Blk *pb, *ab, *b;

showfree("precompress");
fprint(2, "compress start\n");
	/*
	 * Sync the current log to disk. While
	 * compressing the log, nothing else is
	 * using this arena, so any allocs come
	 * from the log compression.
	 *
	 * A bit of copy paste from newblk,
	 * because otherwise we have bootstrap
	 * issues.
	 *
	 * Set up the head of the new log as
	 * an empty block.
	 */
	if((bp = blkalloc_lk(a)) == -1)
		return -1;
	if((b = mallocz(sizeof(Blk), 1)) == nil)
		return -1;
	b->type = Tlog;
	b->flag = Bdirty;
	b->off = bp;
	b->ref = 1;
	b->data = b->buf + Hdrsz;
checkfs();
	if(synclog(b, -1, 0) == -1)
		return -1;
checkfs();

	/*
	 * When reaming or loading, we may not
	 * have set up the log.
	 */
checkfs();
	if(a->logtl != nil)
		synclog(a->logtl, b->off, -1);
checkfs();
	graft = b->off;
	a->logtl = b;

	/*
	 * Prepare what we're writing back.
	 * Arenas must be sized so that we can
	 * keep the merged log in memory for
	 * a rewrite.
	 */
	n = 0;
	sz = 512;
checkfs();
	if((log = malloc(2*sz*sizeof(vlong))) == nil)
		return -1;
checkfs();
	for(r = (Arange*)avlmin(a->free); r != nil; r = (Arange*)avlnext(r)){
		if(n == sz){
			sz *= 2;
			if((p = realloc(log, 2*sz*sizeof(vlong))) == nil){
				free(log);
				return -1;
			}
			log = p;
		}
		log[2*n+0] = r->off;
		log[2*n+1] = r->len;
		n++;
	}
	if((b = newblk(Tlog)) == nil){
		free(log);
		return -1;
	}
checkfs();
	pb = b;
	hd = b->off;
	hh = -1;
	PBIT64(b->data +  8, graft);	/* link */
	PBIT64(b->data + 16, 0ULL);	/* count */
checkfs();

	for(i = 0; i < n; i++){
		if((b = logappend(a, b, log[2*i], log[2*i+1], LgFree, graft)) == nil)
			return -1;
checkfs();
		if(b != pb){
checkfs();
			synclog(pb, b->off, -1);
checkfs();
			if(pb->off == hd)
				hh = blkhash(pb);
			b = pb;
		}
	}
	free(log);
checkfs();
	if(synclog(b, graft, -1) == -1)
		return -1;
checkfs();
	if(pb->off == hd)
		hh = blkhash(pb);
checkfs();

	a->log = hd;
	a->logh = hh;
	ab = a->b;
	PBIT64(ab->data + 0, hd);
	PBIT64(ab->data + 8, hh);
checkfs();
	pwrite(fs->fd, ab->buf, Blksz, ab->off);
checkfs();
fprint(2, "compress done\n");
	return 0;
}
/*
 * Allocate from an arena, with lock
 * held. May be called recursively, to
 * alloc space for the alloc log.
 */
static vlong
blkalloc_lk(Arena *a)
{
	Avltree *t;
	Arange *r;
	vlong b;

	t = a->free;
	r = (Arange*)t->root;
	if(r == nil){
		unlock(a);
		return -1;
	}

	/*
	 * A bit of sleight of hand here:
	 * while we're changing the sorting
	 * key, but we know it won't change
	 * the sort order because the tree
	 * covers disjoint ranges
	 */
	b = r->off;
	r->len -= Blksz;
	r->off += Blksz;
	if(r->len == 0){
		avldelete(t, r);
		free(r);
	}
	return b;
}

int
blkrelease(vlong b)
{
	Arena *a;
	int r;

	r = -1;
	a = getarena(b);
	lock(a);
	if(freerange(a->free, b, Blksz) == -1)
		goto out;
	if(logalloc(a, b, LgFree) == -1)
		goto out;
	r = 0;
out:
	unlock(a);
	return r;
}

vlong
blkalloc(vlong hint)
{
	Arena *a;
	vlong b;
	int tries;

	tries = 0;
again:
	a = pickarena(hint);
	if(a == nil || tries == fs->narena){
		werrstr("no empty arenas");
		return -1;
	}
	lock(a);
	/*
	 * TODO: there's an extreme edge case
	 * here.
	 *
	 * If the file system has room to alloc
	 * a data block but no log block, then
	 * we end up with it in a stuck state.
	 * The fix is to reserve alloc blocks,
	 * so that we're guaranteed to be able
	 * to log an alloc if the disk is working
	 * correctly.
	 */
	tries++;
	if((b = blkalloc_lk(a)) == -1){
		unlock(a);
		goto again;
	}
	if(logalloc(a, b, LgAlloc) == -1){
		unlock(a);
		return -1;
	}
	unlock(a);
	return b;
}

Blk*
newblk(int t)
{
	Blk *b;
	vlong bp;

	if((bp = blkalloc(-1)) == -1)
		return nil;
	if((b = mallocz(sizeof(Blk), 1)) == nil)
		return nil;
	b->type = t;
	b->flag = Bdirty;
	b->off = bp;
	b->ref = 1;
	b->data = b->buf + Hdrsz;
	return cacheblk(b);
}

static Blk*
lookupblk(vlong off)
{
	Bucket *bkt;
	u32int h;
	Blk *b;

	h = ihash(off);

	bkt = &fs->cache[h % fs->cmax];
	lock(bkt);
	for(b = bkt->b; b != nil; b = b->hnext)
		if(b->off == off)
			break;
	if(b != nil)
		pinblk(b);
	unlock(bkt);
	return b;
}

static Blk*
cacheblk(Blk *b)
{
	Bucket *bkt;
	Blk *e, *c;
	u32int h;

	/* FIXME: better hash. */
	assert(b->off != 0);
	h = ihash(b->off);
//	dprint("cache %lld (h=%xm, bkt=%d) => %p\n", b->off, h%fs->cmax, h, b);
	ainc(&b->ref);
	bkt = &fs->cache[h % fs->cmax];
	lock(bkt);
	for(e = bkt->b; e != nil; e = e->hnext){
		if(b == e)
			goto found;
		assert(b->off != e->off);
	}
	bkt->b = b;
found:
	ainc(&b->ref);
	unlock(bkt);

	lock(&fs->lrulk);
	if(b == fs->chead)
		goto Cached;
	if(b == fs->ctail)
		fs->ctail = b->cprev;

	if(b->cnext != nil)
		b->cnext->cprev = b->cprev;
	if(b->cprev != nil)
		b->cprev->cnext = b->cnext;
	if(fs->ctail == nil)
		fs->ctail = b;
	if(fs->chead != nil)
		fs->chead->cprev = b;
	if(fs->ctail == nil)
		fs->ctail = b;
	b->cnext = fs->chead;
	b->cprev = nil;
	fs->chead = b;
	if((b->flag&Bcache) == 0){
		b->flag |= Bcache;
		fs->ccount++;
		ainc(&b->ref);
	}
	c=0;
	USED(c);
/*
	for(c = fs->ctail; c != nil && fs->ccount >= fs->cmax; c = fs->ctail){
		fs->ctail = c->cprev;
		fs->ccount--; 
		putblk(c);
	}
*/
Cached:
	unlock(&fs->lrulk);
	return b;
}

static void
cachedel(Blk *del)
{
	Bucket *bkt;
	Blk *b, **p;
	u32int h;

	/* FIXME: better hash. */
	h = ihash(del->off);

	bkt = &fs->cache[h % fs->cmax];
	lock(bkt);
	p = &bkt->b;
	for(b = bkt->b; b != nil; b = b->hnext){
		if(b == del){
			*p = del->hnext;
			break;
		}
		p = &b->hnext;
	}
	unlock(bkt);

	lock(&fs->lrulk);
	if(b->cnext != nil)
		b->cnext->cprev = b->cprev;
	if(b->cprev != nil)
		b->cprev->cnext = b->cnext;
	if(fs->ctail == b)
		fs->ctail = b->cprev;
	if(fs->chead == nil)
		fs->chead = b;
	unlock(&fs->lrulk);
}

void
enqueue(Blk *b)
{
	if(pwrite(fs->fd, b->buf, Blksz, b->off) == -1){
		ainc(&fs->broken);
		fprint(2, "write: %r");
		return;
	}
	wlock(b);
	b->flag &= ~(Bqueued|Bdirty|Bfinal);
	wunlock(b);

}

void
fillsuper(Blk *b)
{
	char *p;

	assert(b->type == Tsuper);
	p = b->data;
	memcpy(p +  0, "gefs0001", 8);
	PBIT32(p +  8, 0); /* dirty */
	PBIT32(p + 12, Blksz);
	PBIT32(p + 16, Bufspc);
	PBIT32(p + 20, Hdrsz);
	PBIT32(p + 24, fs->height);
	PBIT64(p + 32, fs->rootb);
	PBIT64(p + 40, fs->rooth);
	PBIT32(p + 48, fs->narena);
	PBIT64(p + 56, fs->arenasz);
	PBIT64(p + 64, fs->gen);
	PBIT64(p + 72, fs->nextqid);
}

void
finalize(Blk *b)
{
	vlong h;

//	assert((b->flag & Bfinal) == 0);
	b->flag |= Bfinal;
	PBIT16(b->buf, b->type);
	switch(b->type){
	default:
	case Tnone:
		abort();
		break;
	case Tpivot:
		PBIT16(b->buf+2, b->nval);
		PBIT16(b->buf+4, b->valsz);
		PBIT16(b->buf+6, b->nbuf);
		PBIT16(b->buf+8, b->bufsz);
		break;
	case Tleaf:
		PBIT16(b->buf+2, b->nval);
		PBIT16(b->buf+4, b->valsz);
		break;
	case Tlog:
		h = siphash(b->data + 8, Blkspc-8);
		PBIT64(b->data, h);
	case Tsuper:
	case Tarena:
	case Traw:
		break;
	}
}

Blk*
getblk(vlong bp, uvlong bh)
{
	Blk *b;

	if((b = lookupblk(bp)) == nil){
		if((b = readblk(bp, 0)) == nil)
			return nil;
		if(siphash(b->buf, Blksz) != bh){
			werrstr("corrupt block %llx", bp);
			return nil;
		}
	}
	return cacheblk(b);
}

Blk*
dupblk(vlong bp, uvlong bh)
{
	USED(bp, bh);
	return nil;
}

Blk*
pinblk(Blk *b)
{
	ainc(&b->ref);
	return b;
}

int
refblk(Blk *b)
{
	Arena *a;
	int r;

	a = getarena(b->off);
	lock(a);
	r = logalloc(a, b->off, LgRef1);
	unlock(a);
	return r;
}

int
unrefblk(Blk *b)
{
	Arena *a;
	int r;

	a = getarena(b->off);
	lock(a);
	r = logalloc(a, b->off, LgUnref1);
	unlock(a);
	return r;
}

ushort
blkfill(Blk *b)
{
	switch(b->type){
	case Tpivot:
		return 2*b->nbuf + b->bufsz +  2*b->nval + b->valsz;
	case Tleaf:
		return 2*b->nval + b->valsz;
	default:
		fprint(2, "invalid block @%lld\n", b->off);
		abort();
	}
	return 0; // shut up kencc
}

void
putblk(Blk *b)
{
	if(b == nil)
		return;
	if((b->flag & (Bdirty|Bqueued)) == Bdirty)
		enqueue(b);
	if(adec(&b->ref) == 0){
		cachedel(b);
		free(b);
	}
}

void
freeblk(Blk *b)
{
	Arena *a;

	assert(b->ref == 1 && b->flag & (Bdirty|Bqueued) == Bdirty);
	a = getarena(b->off);
	lock(a);
	/*
	 * TODO: what to do if we fail to log a free here??
	 * This is already an error path!
	 */
	logalloc(a, b->off, LgRef1);
	unlock(a);
	free(b);
}

int
sync(void)
{
	int i, r;
	Blk *b;

	dprint("syncing\n");
	r = 0;
	for(i = 0; i < fs->narena; i++)
		if(synclog(fs->arenas[i].logtl, -1, -1) == -1)
			r = -1;
	for(b = fs->chead; b != nil; b = b->cnext){
//		dprint("sync %p\n", b);
		if(!(b->flag & Bdirty))
			continue;
		if(pwrite(fs->fd, b->buf, Blksz, b->off) == -1)
			r = -1;
	}
	return r;
}