ref: 5ec83af8f7363c4ad8587a5aa492996dbaf7ce23
parent: e02c31e598543f835be60a8fb47c4e20347bb5ef
author: Ori Bernstein <[email protected]>
date: Sun Oct 3 12:17:09 EDT 2021
bptr: move to 24 byte block pointers with generations
--- a/blk.c
+++ b/blk.c
@@ -41,7 +41,9 @@
}
memset(&b->RWLock, 0, sizeof(RWLock));
b->type = (flg&GBraw) ? Traw : GBIT16(b->buf+0);
- b->off = bp;
+ b->bp.addr = bp;
+ b->bp.hash = -1;
+ b->bp.gen = -1;
b->cnext = nil;
b->cprev = nil;
b->hnext = nil;
@@ -189,7 +191,7 @@
lb->data = lb->buf + Hdrsz;
lb->flag |= Bdirty;
lb->type = Tlog;
- lb->off = o;
+ lb->bp.addr = o;
lb->logsz = Loghdsz;
p = lb->data + lb->logsz;
PBIT64(p + 0, (uvlong)LogEnd);
@@ -202,7 +204,7 @@
a->logtl = lb;
if(pb != nil){
p = pb->data + pb->logsz;
- PBIT64(p + 0, lb->off|LogChain);
+ PBIT64(p + 0, lb->bp.addr|LogChain);
finalize(pb);
if(syncblk(pb) == -1)
return nil;
@@ -240,7 +242,7 @@
if((b = logappend(a, a->logtl, off, Blksz, op)) == nil)
return -1;
if(a->log == -1)
- a->log = b->off;
+ a->log = b->bp.addr;
if(b != a->logtl)
a->logtl = b;
return 0;
@@ -292,9 +294,9 @@
case LogFlush:
dprint("log@%d: flush: %llx\n", i, off>>8);
- lock(&fs->genlk);
- fs->gen = off >> 8;
- unlock(&fs->genlk);
+ lock(&fs->root.lk);
+ fs->root.bp.gen = off >> 8;
+ unlock(&fs->root.lk);
break;
case LogAlloc:
case LogAlloc1:
@@ -349,7 +351,7 @@
return -1;
b->type = Tlog;
b->flag = Bdirty;
- b->off = bp;
+ b->bp.addr = bp;
b->ref = 1;
b->data = b->buf + Hdrsz;
b->logsz = Loghdsz;
@@ -361,7 +363,7 @@
return -1;
}
- graft = b->off;
+ graft = b->bp.addr;
if(a->logtl != nil){
finalize(a->logtl);
if(syncblk(a->logtl) == -1){
@@ -411,7 +413,7 @@
return -1;
oldhd = a->log;
- a->log = hd->off;
+ a->log = hd->bp.addr;
a->logh = blkhash(hd);
ab = a->b;
PBIT64(ab->data + 0, a->log);
@@ -554,7 +556,9 @@
return nil;
b->type = t;
b->flag = Bdirty;
- b->off = bp;
+ b->bp.addr = bp;
+ b->bp.hash = -1;
+ b->bp.gen = fs->nextgen;
b->ref = 1;
b->data = b->buf + Hdrsz;
return cacheblk(b);
@@ -572,7 +576,7 @@
bkt = &fs->cache[h % fs->cmax];
lock(bkt);
for(b = bkt->b; b != nil; b = b->hnext)
- if(b->off == off)
+ if(b->bp.addr == off)
break;
if(b != nil)
pinblk(b);
@@ -588,8 +592,8 @@
u32int h;
/* FIXME: better hash. */
- assert(b->off != 0);
- h = ihash(b->off);
+ assert(b->bp.addr != 0);
+ h = ihash(b->bp.addr);
ainc(&b->ref);
bkt = &fs->cache[h % fs->cmax];
lock(bkt);
@@ -596,7 +600,7 @@
for(e = bkt->b; e != nil; e = e->hnext){
if(b == e)
goto found;
- assert(b->off != e->off);
+ assert(b->bp.addr != e->bp.addr);
}
bkt->b = b;
found:
@@ -655,7 +659,7 @@
lock(bkt);
p = &bkt->b;
for(b = bkt->b; b != nil; b = b->hnext){
- if(b->off == del){
+ if(b->bp.addr == del){
*p = b->hnext;
break;
}
@@ -684,7 +688,7 @@
wlock(b);
b->flag &= ~(Bqueued|Bdirty);
wunlock(b);
- return pwrite(fs->fd, b->buf, Blksz, b->off);
+ return pwrite(fs->fd, b->buf, Blksz, b->bp.addr);
}
@@ -700,7 +704,7 @@
}
}
-void
+char*
fillsuper(Blk *b)
{
char *p;
@@ -710,18 +714,19 @@
wlock(b);
b->flag |= Bdirty;
wunlock(b);
- memcpy(p + 0, "gefs0001", 8);
- PBIT32(p + 8, 0); /* dirty */
- PBIT32(p + 12, Blksz);
- PBIT32(p + 16, Bufspc);
- PBIT32(p + 20, Hdrsz);
- PBIT32(p + 24, fs->root.ht);
- PBIT64(p + 32, fs->root.bp.addr);
- PBIT64(p + 40, fs->root.bp.hash);
- PBIT32(p + 48, fs->narena);
- PBIT64(p + 56, fs->arenasz);
- PBIT64(p + 64, fs->gen);
- PBIT64(p + 72, fs->nextqid);
+ memcpy(p, "gefs0001", 8); p += 8;
+ PBIT32(p, 0); p += 4; /* dirty */
+ PBIT32(p, Blksz); p += 4;
+ PBIT32(p, Bufspc); p += 4;
+ PBIT32(p, Hdrsz); p += 4;
+ PBIT32(p, fs->root.ht); p += 4;
+ PBIT64(p, fs->root.bp.addr); p += 8;
+ PBIT64(p, fs->root.bp.hash); p += 8;
+ PBIT64(p, fs->root.bp.gen); p += 8;
+ PBIT32(p, fs->narena); p += 4;
+ PBIT64(p, fs->arenasz); p += 8;
+ PBIT64(p, fs->nextqid); p += 8;
+ return p;
}
void
@@ -743,17 +748,21 @@
PBIT16(b->buf+4, b->valsz);
PBIT16(b->buf+6, b->nbuf);
PBIT16(b->buf+8, b->bufsz);
+ b->bp.hash = blkhash(b);
break;
case Tleaf:
PBIT16(b->buf+2, b->nval);
PBIT16(b->buf+4, b->valsz);
+ b->bp.hash = blkhash(b);
break;
case Tlog:
h = siphash(b->data + 8, Blkspc-8);
PBIT64(b->data, h);
+ case Traw:
+ b->bp.hash = blkhash(b);
+ break;
case Tsuper:
case Tarena:
- case Traw:
break;
}
}
@@ -770,8 +779,10 @@
werrstr("corrupt block %B: %llx != %llx", bp, blkhash(b), bp.hash);
return nil;
}
+ b->bp.hash = bp.hash;
+ b->bp.gen = bp.gen;
}
- assert(b->off == bp.addr);
+ assert(b->bp.addr == bp.addr);
return cacheblk(b);
}
@@ -798,7 +809,7 @@
case Tleaf:
return 2*b->nval + b->valsz;
default:
- fprint(2, "invalid block @%lld\n", b->off);
+ fprint(2, "invalid block @%lld\n", b->bp.addr);
abort();
}
return 0; // shut up kencc
@@ -811,7 +822,7 @@
return;
if(adec(&b->ref) == 0){
assert((b->flag & Bqueued) || !(b->flag & Bdirty));
- cachedel(b->off);
+ cachedel(b->bp.addr);
free(b);
}
}
@@ -822,10 +833,10 @@
Arena *a;
assert(b->ref == 1 && b->flag & (Bdirty|Bqueued) == Bdirty);
- a = getarena(b->off);
+ a = getarena(b->bp.addr);
lock(a);
- logop(a, b->off, LogFree);
- blkdealloc(b->off);
+ logop(a, b->bp.addr, LogFree);
+ blkdealloc(b->bp.addr);
unlock(a);
free(b);
}
--- a/check.c
+++ b/check.c
@@ -208,7 +208,7 @@
int i;
print("path:\n");
-#define A(b) (b ? b->off : -1)
+#define A(b) (b ? b->bp.addr : -1)
for(i = 0; i < np; i++){
print("\t[%d] ==> b(%p)=%llx, n(%p)=%llx, nl(%p)=%llx, nr(%p)=%llx idx=%d\n",
i, p[i].b, A(p[i].b), p[i].n, A(p[i].n), p[i].nl, A(p[i].nl), p[i].nr, A(p[i].nr), p[i].idx);
--- a/dat.h
+++ b/dat.h
@@ -16,7 +16,6 @@
typedef struct Bucket Bucket;
typedef struct Chan Chan;
typedef struct Tree Tree;
-typedef struct Bptr Bptr;
enum {
KiB = 1024ULL,
@@ -43,8 +42,9 @@
Loghdsz = 8, /* log hash */
Keymax = 128, /* key data limit */
Inlmax = 256, /* inline data limit */
- Ptrsz = 18, /* off, hash, fill */
- Offsz = 17, /* type, qid, off */
+ Ptrsz = 24, /* off, hash, gen */
+ Fillsz = 2, /* block fill count */
+ Offksz = 17, /* type, qid, off */
Kvmax = Keymax + Inlmax, /* Key and value */
Kpmax = Keymax + Ptrsz, /* Key and pointer */
@@ -237,6 +237,7 @@
struct Bptr {
vlong addr;
vlong hash;
+ vlong gen;
};
struct Tree {
@@ -264,13 +265,11 @@
int fd;
long broken;
- /* protected by rootlk */
Tree root;
- Lock genlk;
- vlong gen;
Lock qidlk;
vlong nextqid;
+ vlong nextgen; /* unlocked: only touched by mutator thread */
Arena *arenas;
int narena;
@@ -433,8 +432,8 @@
vlong logsz; /* for allocation log */
vlong lognxt; /* for allocation log */
- vlong off; /* -1 for unallocated */
- long ref; /* TODO: move out */
+ Bptr bp;
+ long ref;
char *data;
char buf[Blksz];
};
--- a/fns.h
+++ b/fns.h
@@ -24,7 +24,7 @@
uvlong blkhash(Blk*);
u32int ihash(vlong);
void finalize(Blk*);
-void fillsuper(Blk*);
+char* fillsuper(Blk*);
int snapshot(void);
uvlong siphash(void*, usize);
void reamfs(char*);
@@ -79,6 +79,9 @@
int kv2statbuf(Kvp*, char*, int);
int kv2dir(Kvp*, Dir*);
int kv2qid(Kvp*, Qid*);
+
+char *packbp(char*, Bptr*);
+Bptr unpackbp(char*);
/* scratch */
void setmsg(Blk *, int, Msg *);
--- a/fs.c
+++ b/fs.c
@@ -824,8 +824,7 @@
return -1;
}
fprint(2, "\treadb: key=%K, val=%P\n", &k, &kv);
- bp.addr = GBIT64(kv.v+0);
- bp.hash = GBIT64(kv.v+8);
+ bp = unpackbp(kv.v);
putblk(b);
if((b = getblk(bp, GBraw)) == nil)
@@ -921,7 +920,6 @@
writeb(Fid *f, Msg *m, char *s, vlong o, vlong n, vlong sz)
{
vlong fb, fo;
- uvlong bh;
Bptr bp;
Blk *b, *t;
Kvp kv;
@@ -938,13 +936,12 @@
if(b == nil)
return -1;
if(fb < sz && (fo != 0 || n != Blksz)){
- fprint(2, "\tappending to block %llx\n", b->off);
+ fprint(2, "\tappending to block %B\n", b->bp);
if(fslookup(f, m, &kv, &t, 0) != nil){
putblk(b);
return -1;
}
- bp.addr = GBIT64(kv.v+0);
- bp.hash = GBIT64(kv.v+8);
+ bp = unpackbp(kv.v);
putblk(t);
if((t = getblk(bp, GBraw)) == nil){
@@ -959,9 +956,9 @@
memcpy(b->buf+fo, s, n);
enqueue(b);
- bh = blkhash(b);
- PBIT64(m->v+0, b->off);
- PBIT64(m->v+8, bh);
+ bp.gen = fs->nextgen;
+ assert(b->flag & Bfinal);
+ packbp(m->v, &b->bp);
putblk(b);
checkfs();
poolcheck(mainmem);
@@ -971,7 +968,7 @@
void
fswrite(Fmsg *m)
{
- char sbuf[8], offbuf[4][Ptrsz+Offsz], *p;
+ char sbuf[8], offbuf[4][Ptrsz+Offksz], *p;
vlong n, o, c;
Msg kv[4];
Fcall r;
@@ -995,8 +992,8 @@
for(i = 0; i < nelem(kv)-1 && c != 0; i++){
kv[i].op = Oinsert;
kv[i].k = offbuf[i];
- kv[i].nk = Offsz;
- kv[i].v = offbuf[i]+Offsz;
+ kv[i].nk = Offksz;
+ kv[i].v = offbuf[i]+Offksz;
kv[i].nv = 16;
n = writeb(f, &kv[i], p, o, c, f->dent->length);
if(n == -1){
--- a/load.c
+++ b/load.c
@@ -39,6 +39,7 @@
Blk *b;
Dir *d;
int i, dirty;
+ int blksz, bufspc, hdrsz;
if((fs->fd = open(dev, ORDWR)) == -1)
sysfatal("open %s: %r", dev);
@@ -52,31 +53,29 @@
sysfatal("read superblock: %r");
if(b->type != Tsuper)
sysfatal("corrupt superblock: bad type");
- p = b->data;
- if(memcmp(p, "gefs0001", 8) != 0)
+ if(memcmp(b->data, "gefs0001", 8) != 0)
sysfatal("corrupt superblock: bad magic");
- dirty = GBIT32(p + 8);
- if(GBIT32(p + 12) != Blksz)
- sysfatal("fs uses different block size");
- if(GBIT32(p + 16) != Bufspc)
- sysfatal("fs uses different buffer size");
- if(GBIT32(p + 20) != Hdrsz)
- sysfatal("fs uses different buffer size");
- fs->root.ht = GBIT32(p + 24);
- fs->root.bp.addr = GBIT64(p + 32);
- fs->root.bp.hash = GBIT64(p + 40);
- fs->narena = GBIT32(p + 48);
- fs->arenasz = GBIT64(p + 56);
- fs->arenasz = GBIT64(p + 56);
- fs->gen = GBIT64(p + 64);
- fs->nextqid = GBIT64(p + 72);
+ p = b->data + 8;
+
+ dirty = GBIT32(p); p += 4; /* dirty */
+ blksz = GBIT32(p); p += 4;
+ bufspc = GBIT32(p); p += 4;
+ hdrsz = GBIT32(p); p += 4;
+ fs->root.ht = GBIT32(p); p += 4;
+ fs->root.bp.addr = GBIT64(p); p += 8;
+ fs->root.bp.hash = GBIT64(p); p += 8;
+ fs->root.bp.gen = GBIT64(p); p += 8;
+ fs->narena = GBIT32(p); p += 4;
+ fs->arenasz = GBIT64(p); p += 8;
+ fs->nextqid = GBIT64(p); p += 8;
fs->super = b;
+ fs->nextgen = fs->root.bp.gen+1;
+
fprint(2, "load: %8s\n", p);
fprint(2, "\theight:\t%d\n", fs->root.ht);
- fprint(2, "\trootb:\t%B\n", fs->root.bp);
+ fprint(2, "\troot:\t%B\n", fs->root.bp);
fprint(2, "\tarenas:\t%d\n", fs->narena);
fprint(2, "\tarenasz:\t%lld\n", fs->arenasz);
- fprint(2, "\trootgen:\t%lld\n", fs->gen);
fprint(2, "\tnextqid:\t%lld\n", fs->nextqid);
if((fs->arenas = calloc(fs->narena, sizeof(Arena))) == nil)
sysfatal("malloc: %r");
@@ -83,6 +82,12 @@
for(i = 0; i < fs->narena; i++)
if((loadarena(&fs->arenas[i], i*fs->arenasz)) == -1)
sysfatal("loadfs: %r");
+ if(bufspc != Bufspc)
+ sysfatal("fs uses different buffer size");
+ if(hdrsz != Hdrsz)
+ sysfatal("fs uses different buffer size");
+ if(blksz != Blksz)
+ sysfatal("fs uses different block size");
if(dirty){
fprint(2, "file system was not unmounted cleanly");
/* TODO: start gc pass */
--- a/main.c
+++ b/main.c
@@ -20,7 +20,7 @@
Bptr bp;
bp = va_arg(fmt->args, Bptr);
- return fmtprint(fmt, "(%llx,%llx)", bp.addr, bp.hash);
+ return fmtprint(fmt, "(%llx,%llx,%llx)", bp.addr, bp.hash, bp.gen);
}
void
--- a/pack.c
+++ b/pack.c
@@ -269,3 +269,23 @@
}
return 0;
}
+
+char*
+packbp(char *p, Bptr *bp)
+{
+ PBIT64(p + 0, bp->addr);
+ PBIT64(p + 8, bp->hash);
+ PBIT64(p + 16, bp->gen);
+ return p + 24;
+}
+
+Bptr
+unpackbp(char *p)
+{
+ Bptr bp;
+
+ bp.addr = GBIT64(p + 0);
+ bp.hash = GBIT64(p + 8);
+ bp.gen = GBIT64(p + 16);
+ return bp;
+}
--- a/ream.c
+++ b/ream.c
@@ -42,27 +42,27 @@
static void
reamarena(Arena *a, vlong start, vlong asz)
{
- vlong off, bo, bh;
+ vlong addr, bo, bh;
char *p;
Blk *b;
- off = start;
+ addr = start;
if((b = mallocz(sizeof(Blk), 1)) == nil)
sysfatal("ream: %r");
- off += Blksz; /* arena header */
+ addr += Blksz; /* arena header */
a->log = -1;
memset(b, 0, sizeof(Blk));
b->type = Tlog;
- b->off = off;
+ b->bp.addr = addr;
b->logsz = 32;
b->data = b->buf + Hdrsz;
b->flag |= Bdirty;
p = b->data+Loghdsz;
- PBIT64(p+ 0, off|LogFree); /* off */
+ PBIT64(p+ 0, addr|LogFree); /* addr */
PBIT64(p+ 8, asz); /* len */
- PBIT64(p+16, b->off|LogAlloc); /* off */
+ PBIT64(p+16, b->bp.addr|LogAlloc); /* addr */
PBIT64(p+24, Blksz); /* len */
PBIT64(p+32, (uvlong)LogEnd); /* done */
finalize(b);
@@ -70,13 +70,13 @@
sysfatal("ream: init log");
bh = blkhash(b);
- bo = b->off;
+ bo = b->bp.addr;
memset(b, 0, sizeof(Blk));
b->type = Tarena;
- b->off = start;
+ b->bp.addr = start;
p = b->buf + Hdrsz;
- print("b->off: %llx\n", b->off);
+ print("b->bp.addr: %llx\n", b->bp.addr);
PBIT64(p+0, bo);
PBIT64(p+8, bh);
finalize(b);
@@ -126,7 +126,7 @@
}
s->type = Tsuper;
- s->off = sz;
+ s->bp.addr = sz;
s->data = s->buf + Hdrsz;
fillsuper(s);
finalize(s);
@@ -148,8 +148,7 @@
syncblk(r);
fs->super = s;
- fs->root.bp.addr = r->off;
- fs->root.bp.hash = blkhash(r);
+ fs->root.bp = r->bp;
fs->root.ht = 1;
snapshot();
--- a/tree.c
+++ b/tree.c
@@ -64,7 +64,7 @@
valsz(Kvp *kv)
{
if(kv->type == Vref)
- return 2+kv->nk + Ptrsz;
+ return 2+kv->nk + Ptrsz + Fillsz;
else
return 2+kv->nk + 2+kv->nv;
}
@@ -80,9 +80,8 @@
kv->type = Vref;
kv->nk = GBIT16(b->data + o);
kv->k = b->data + o + 2;
- kv->bp.addr = GBIT64(kv->k + kv->nk + 0);
- kv->bp.hash = GBIT64(kv->k + kv->nk + 8);
- kv->fill = GBIT16(kv->k + kv->nk + 16);
+ kv->bp = unpackbp(kv->k + kv->nk);
+ kv->fill = GBIT16(kv->k + kv->nk + Ptrsz);
}else{
kv->type = Vinl;
kv->nk = GBIT16(b->data + o);
@@ -102,7 +101,7 @@
spc = (b->type == Tleaf) ? Leafspc : Pivspc;
p = b->data + 2*i;
nk = 2 + kv->nk;
- nv = (kv->type == Vref) ? Ptrsz : 2 + kv->nv;
+ nv = (kv->type == Vref) ? Ptrsz+Fillsz : 2 + kv->nv;
if (i < 0)
i = 0;
if(!replace || b->nval == i){
@@ -142,9 +141,8 @@
PBIT16(b->data + 2*i, o);
PBIT16(p + 0, kv->nk);
memcpy(p + 2, kv->k, kv->nk);
- PBIT64(p + kv->nk + 2, kv->bp.addr);
- PBIT64(p + kv->nk + 10, kv->bp.hash);
- PBIT16(p + kv->nk + 18, kv->fill);
+ p = packbp(p + kv->nk + 2, &kv->bp);
+ PBIT16(p, kv->fill);
} else {
PBIT16(b->data + 2*i, o);
PBIT16(p + 0, kv->nk);
@@ -374,8 +372,7 @@
if(pp->l->nval > 0){
getval(pp->l, 0, &kv);
kv.type = Vref;
- kv.bp.addr = pp->l->off;
- kv.bp.hash = blkhash(pp->l);
+ kv.bp = pp->l->bp;
kv.fill = blkfill(pp->l);
setval(n, i++, &kv, 0);
if(nbytes != nil)
@@ -384,8 +381,7 @@
if(pp->r->nval > 0){
getval(pp->r, 0, &kv);
kv.type = Vref;
- kv.bp.addr = pp->r->off;
- kv.bp.hash = blkhash(pp->r);
+ kv.bp = pp->r->bp;
kv.fill = blkfill(pp->r);
setval(n, i++, &kv, 0);
if(nbytes != nil)
@@ -395,8 +391,7 @@
if(pp->n->nval > 0){
getval(pp->n, 0, &kv);
kv.type = Vref;
- kv.bp.addr = pp->n->off;
- kv.bp.hash = blkhash(pp->n);
+ kv.bp = pp->n->bp;
kv.fill = blkfill(pp->n);
setval(n, i++, &kv, 1);
if(nbytes != nil)
@@ -437,15 +432,13 @@
}else if(i == midx){
getval(p->nl, 0, &m);
m.type = Vref;
- m.bp.addr = p->nl->off;
- m.bp.hash = blkhash(p->nl);
+ m.bp = p->nl->bp;
m.fill = blkfill(p->nl);
setval(n, j++, &m, 0);
if(p->nr){
getval(p->nr, 0, &m);
m.type = Vref;
- m.bp.addr = p->nr->off;
- m.bp.hash = blkhash(p->nr);
+ m.bp = p->nr->bp;
m.fill = blkfill(p->nr);
setval(n, j++, &m, 0);
i++;
@@ -1049,7 +1042,6 @@
btupsert(Tree *t, Msg *msg, int nmsg)
{
int i, npath, redo, dh, sz, height;
- vlong rh;
Path *path;
Blk *b, *rb;
Kvp sep;
@@ -1117,12 +1109,11 @@
abort();
- assert(rb->off != 0);
- rh = blkhash(rb);
+ assert(rb->bp.addr != 0);
lock(&t->lk);
t->ht += dh;
- t->bp.addr = rb->off;
- t->bp.hash = rh;
+ t->bp = rb->bp;
+ fs->nextgen++;
unlock(&t->lk);
freepath(path, npath);
@@ -1269,17 +1260,17 @@
p[i].vi = blksearch(b, &s->kv, &v, &same);
if(p[i].vi == -1 || (p[i].vi+1 < b->nval && !same && b->type == Tleaf)){
getval(b, ++p[i].vi, &v);
- }else if(b->type == Tpivot){
+ }
+ if(b->type == Tpivot){
p[i].bi = bufsearch(b, &s->kv, &m, &same);
if(p[i].bi == -1 || !same)
p[i].bi++;
if((b = getblk(v.bp, 0)) == nil)
- return "error readivg block";
+ return "error reading block";
p[i+1].b = b;
- }else{
- assert(i == s->root.ht-1);
}
}
+ assert(i == s->root.ht);
return nil;
}
@@ -1349,6 +1340,7 @@
h = s->root.ht;
*done = 0;
start = h;
+
for(i = h-1; i > 0; i--){
if(p[i].vi < p[i].b->nval || p[i].bi < p[i].b->nbuf)
break;