ref: 6b02773727cc3b9c434ef1ce5bbd94dd6b706aef
dir: /blk.c/
#include <u.h> #include <libc.h> #include <fcall.h> #include <avl.h> #include "dat.h" #include "fns.h" static vlong blkalloc_lk(Arena*); static Blk *cacheblk(Blk*); static Blk *lookupblk(vlong); Blk* readblk(vlong bp, int flg) { Blk *b; vlong off, rem, n; assert(bp != -1); if((b = malloc(sizeof(Blk))) == nil) return nil; off = bp; rem = Blksz; while(rem != 0){ n = pread(fs->fd, b->buf, rem, off); if(n <= 0){ free(b); return nil; } off += n; rem -= n; } memset(&b->RWLock, 0, sizeof(RWLock)); b->type = (flg&GBraw) ? Traw : GBIT16(b->buf+0); b->off = bp; b->cnext = nil; b->cprev = nil; b->hnext = nil; b->data = b->buf + 10; switch(b->type){ default: if(flg&GBraw) break; fprint(2, "invalid block @%llx\n", bp); abort(); break; case Tarena: case Traw: case Tsuper: case Tlog: break; case Tpivot: b->nval = GBIT16(b->buf+2); b->valsz = GBIT16(b->buf+4); b->nbuf = GBIT16(b->buf+6); b->bufsz = GBIT16(b->buf+8); break; case Tleaf: b->nval = GBIT16(b->buf+2); b->valsz = GBIT16(b->buf+4); break; } return b; } static Arena* pickarena(vlong hint) { long n; n = -1; /* shut up, ken */ if(hint > 0 || hint < fs->narena) n = hint / fs->arenasz; else if(hint == -1) n = ainc(&fs->nextarena) % fs->narena; else abort(); return &fs->arenas[n]; } static Arena* getarena(vlong b) { int i; i = b / fs->arenasz; if(i < 0 || i >= fs->narena){ werrstr("out of range block %lld", b); abort(); // return nil; } return &fs->arenas[i]; } static int freerange(Avltree *t, vlong off, vlong len) { Arange *r, *s, q; assert(len % Blksz == 0); q.off = off; q.len = len; r = (Arange*)avllookup(t, &q.Avl, -1); if(r == nil){ if((r = calloc(1, sizeof(Arange))) == nil) return -1; r->off = off; r->len = len; avlinsert(t, r); }else if(off+len == r->off){ r->len += len; r->off -= len; }else if(off == r->off + r->len){ r->len += len; }else{ r = (Arange*)avlnext(r); if(r != nil && off+len == r->off){ r->off -= len; r->len += len; } else { if((r = calloc(1, sizeof(Arange))) == nil) return -1; r->off = off; r->len = len; avlinsert(t, r); } } /* merge with previous */ s = (Arange*)avlprev(r); if(s != nil && s->off + s->len == r->off){ s->off += r->len; avldelete(t, r); } /* merge with next */ s = (Arange*)avlnext(r); if(s != nil && r->off + r->len == s->off){ r->off += r->len; avldelete(t, s); } return 0; } int grabrange(Avltree *t, vlong off, vlong len) { Arange *r, *s, q; assert(len % Blksz == 0); q.off = off; q.len = len; r = (Arange*)avllookup(t, &q.Avl, -1); if(r == nil || off + len > r->off + r->len){ fprint(2, "no blocks: %llx+%llx in:\n", off, len); for(r = (Arange*)avlmin(t); r != nil; r = (Arange*)avlnext(r)) fprint(2, "\t%llx+%llx\n", r->off, r->len); abort(); return -1; } if(off == r->off) r->off += len; else if(off == r->off + r->len) r->len -= len; else if(off + len > r->off && off > r->off + r->len){ if((s = malloc(sizeof(Arange))) == nil) return -1; s->off = off; s->len = r->len - (off - r->off) - len;; r->len = off - r->off; avlinsert(t, s); } if(r->len == 0){ avldelete(t, r); free(r); } return 0; } int synclog(Blk *b, vlong link, vlong sz) { if(sz == -1) sz = b->logsz; PBIT64(b->data+8, link); PBIT64(b->data+16, sz); finalize(b); fprint(2, "synclog: @%llx\n", b->off); showfree("synclog"); return pwrite(fs->fd, b->buf, Blksz, b->off); } Blk* logappend(Arena *a, Blk *lb, vlong off, vlong len, int op, vlong graft) { Blk *pb; vlong o, n; char *p; assert(off % Blksz == 0); if(lb == nil || lb->logsz+16+24 > Blkspc){ pb = lb; if((o = blkalloc_lk(a)) == -1) return nil; if((lb = mallocz(sizeof(Blk), 1)) == nil) return nil; lb->data = lb->buf + Hdrsz; lb->flag |= Bdirty; lb->type = Tlog; lb->off = o; lb->logsz = 0; if(synclog(lb, graft, 0) == -1) return nil; a->logtl = lb; if(pb != nil) if(synclog(pb, lb->off, -1) == -1) return nil; } n = 8; p = lb->data + 24 + lb->logsz; if(len == Blksz && op == LgAlloc) op = LgAlloc1; off |= op; PBIT64(p, off); if(op == LgAlloc || op == LgFree){ PBIT64(p+8, len); n += 8; } lb->logsz += n; return lb; } /* * Logs an allocation. Must be called * with arena lock held. Duplicates some/c * of the work in allocblk to prevent * recursion. */ int logalloc(Arena *a, vlong off, int op) { Blk *b; if((b = logappend(a, a->logtl, off, Blksz, op, -1)) == nil) return -1; if(a->logtl == nil){ a->log = b->off; a->logtl = b; } return 0; } int loadlog(Arena *a) { Blk *b; vlong bp, ent, off, len; uvlong bh; char *p, *d; int op, i, n; bp = a->log; while(bp != -1){ dprint("log block: %llx\n", bp); if((b = readblk(bp, 0)) == nil) return -1; p = b->data; bh = GBIT64(p + 0); bp = GBIT64(p + 8); b->logsz = GBIT64(p + 16); /* the hash covers the log and offset */ if(bh != siphash(p+8, Blkspc-8)){ werrstr("corrupt log"); return -1; } for(i = 0; i < b->logsz; i += n){ d = p + i + 24; ent = GBIT64(d); op = ent & 0xff; off = ent & ~0xff; switch(op){ case LgFlush: dprint("log@%d: flush: %llx\n", i, off>>8); n = 8; lock(&fs->genlk); fs->gen = off >> 8; unlock(&fs->genlk); break; case LgAlloc1: n = 8; dprint("log@%d alloc1: %llx\n", i, off); if(grabrange(a->free, off & ~0xff, Blksz) == -1) return -1; break; case LgAlloc: n = 16; len = GBIT64(d+8); dprint("log@%d alloc: %llx+%llx\n", i, off, len); if(grabrange(a->free, off & ~0xff, len) == -1) return -1; break; case LgFree: n = 16; len = GBIT64(d+8); dprint("log@%d free: %llx+%llx\n", i, off, len); if(freerange(a->free, off & ~0xff, len) == -1) return -1; break; case LgRef1: case LgUnref1: n = 8; fprint(2, "unimplemented ref op at log@%d: log op %d\n", i, op); break; default: n = 0; dprint("log@%d: log op %d\n", i, op); abort(); break; } } } return 0; } int compresslog(Arena *a) { Arange *r; vlong *log, *p, bp, hd, hh, graft; int i, n, sz; Blk *pb, *ab, *b; showfree("precompress"); fprint(2, "compress start\n"); /* * Sync the current log to disk. While * compressing the log, nothing else is * using this arena, so any allocs come * from the log compression. * * A bit of copy paste from newblk, * because otherwise we have bootstrap * issues. * * Set up the head of the new log as * an empty block. */ if((bp = blkalloc_lk(a)) == -1) return -1; if((b = mallocz(sizeof(Blk), 1)) == nil) return -1; b->type = Tlog; b->flag = Bdirty; b->off = bp; b->ref = 1; b->data = b->buf + Hdrsz; checkfs(); if(synclog(b, -1, 0) == -1) return -1; checkfs(); /* * When reaming or loading, we may not * have set up the log. */ checkfs(); if(a->logtl != nil) synclog(a->logtl, b->off, -1); checkfs(); graft = b->off; a->logtl = b; /* * Prepare what we're writing back. * Arenas must be sized so that we can * keep the merged log in memory for * a rewrite. */ n = 0; sz = 512; checkfs(); if((log = malloc(2*sz*sizeof(vlong))) == nil) return -1; checkfs(); for(r = (Arange*)avlmin(a->free); r != nil; r = (Arange*)avlnext(r)){ if(n == sz){ sz *= 2; if((p = realloc(log, 2*sz*sizeof(vlong))) == nil){ free(log); return -1; } log = p; } log[2*n+0] = r->off; log[2*n+1] = r->len; n++; } if((b = newblk(Tlog)) == nil){ free(log); return -1; } checkfs(); pb = b; hd = b->off; hh = -1; PBIT64(b->data + 8, graft); /* link */ PBIT64(b->data + 16, 0ULL); /* count */ checkfs(); for(i = 0; i < n; i++){ if((b = logappend(a, b, log[2*i], log[2*i+1], LgFree, graft)) == nil) return -1; checkfs(); if(b != pb){ checkfs(); synclog(pb, b->off, -1); checkfs(); if(pb->off == hd) hh = blkhash(pb); b = pb; } } free(log); checkfs(); if(synclog(b, graft, -1) == -1) return -1; checkfs(); if(pb->off == hd) hh = blkhash(pb); checkfs(); a->log = hd; a->logh = hh; ab = a->b; PBIT64(ab->data + 0, hd); PBIT64(ab->data + 8, hh); checkfs(); pwrite(fs->fd, ab->buf, Blksz, ab->off); checkfs(); fprint(2, "compress done\n"); return 0; } /* * Allocate from an arena, with lock * held. May be called recursively, to * alloc space for the alloc log. */ static vlong blkalloc_lk(Arena *a) { Avltree *t; Arange *r; vlong b; t = a->free; r = (Arange*)t->root; if(r == nil){ unlock(a); return -1; } /* * A bit of sleight of hand here: * while we're changing the sorting * key, but we know it won't change * the sort order because the tree * covers disjoint ranges */ b = r->off; r->len -= Blksz; r->off += Blksz; if(r->len == 0){ avldelete(t, r); free(r); } return b; } int blkrelease(vlong b) { Arena *a; int r; r = -1; a = getarena(b); lock(a); if(freerange(a->free, b, Blksz) == -1) goto out; if(logalloc(a, b, LgFree) == -1) goto out; r = 0; out: unlock(a); return r; } vlong blkalloc(vlong hint) { Arena *a; vlong b; int tries; tries = 0; again: a = pickarena(hint); if(a == nil || tries == fs->narena){ werrstr("no empty arenas"); return -1; } lock(a); /* * TODO: there's an extreme edge case * here. * * If the file system has room to alloc * a data block but no log block, then * we end up with it in a stuck state. * The fix is to reserve alloc blocks, * so that we're guaranteed to be able * to log an alloc if the disk is working * correctly. */ tries++; if((b = blkalloc_lk(a)) == -1){ unlock(a); goto again; } if(logalloc(a, b, LgAlloc) == -1){ unlock(a); return -1; } unlock(a); return b; } Blk* newblk(int t) { Blk *b; vlong bp; if((bp = blkalloc(-1)) == -1) return nil; if((b = mallocz(sizeof(Blk), 1)) == nil) return nil; b->type = t; b->flag = Bdirty; b->off = bp; b->ref = 1; b->data = b->buf + Hdrsz; return cacheblk(b); } static Blk* lookupblk(vlong off) { Bucket *bkt; u32int h; Blk *b; h = ihash(off); bkt = &fs->cache[h % fs->cmax]; lock(bkt); for(b = bkt->b; b != nil; b = b->hnext) if(b->off == off) break; if(b != nil) pinblk(b); unlock(bkt); return b; } static Blk* cacheblk(Blk *b) { Bucket *bkt; Blk *e, *c; u32int h; /* FIXME: better hash. */ assert(b->off != 0); h = ihash(b->off); // dprint("cache %lld (h=%xm, bkt=%d) => %p\n", b->off, h%fs->cmax, h, b); ainc(&b->ref); bkt = &fs->cache[h % fs->cmax]; lock(bkt); for(e = bkt->b; e != nil; e = e->hnext){ if(b == e) goto found; assert(b->off != e->off); } bkt->b = b; found: ainc(&b->ref); unlock(bkt); lock(&fs->lrulk); if(b == fs->chead) goto Cached; if(b == fs->ctail) fs->ctail = b->cprev; if(b->cnext != nil) b->cnext->cprev = b->cprev; if(b->cprev != nil) b->cprev->cnext = b->cnext; if(fs->ctail == nil) fs->ctail = b; if(fs->chead != nil) fs->chead->cprev = b; if(fs->ctail == nil) fs->ctail = b; b->cnext = fs->chead; b->cprev = nil; fs->chead = b; if((b->flag&Bcache) == 0){ b->flag |= Bcache; fs->ccount++; ainc(&b->ref); } c=0; USED(c); /* for(c = fs->ctail; c != nil && fs->ccount >= fs->cmax; c = fs->ctail){ fs->ctail = c->cprev; fs->ccount--; putblk(c); } */ Cached: unlock(&fs->lrulk); return b; } static void cachedel(Blk *del) { Bucket *bkt; Blk *b, **p; u32int h; /* FIXME: better hash. */ h = ihash(del->off); bkt = &fs->cache[h % fs->cmax]; lock(bkt); p = &bkt->b; for(b = bkt->b; b != nil; b = b->hnext){ if(b == del){ *p = del->hnext; break; } p = &b->hnext; } unlock(bkt); lock(&fs->lrulk); if(b->cnext != nil) b->cnext->cprev = b->cprev; if(b->cprev != nil) b->cprev->cnext = b->cnext; if(fs->ctail == b) fs->ctail = b->cprev; if(fs->chead == nil) fs->chead = b; unlock(&fs->lrulk); } void enqueue(Blk *b) { if(pwrite(fs->fd, b->buf, Blksz, b->off) == -1){ ainc(&fs->broken); fprint(2, "write: %r"); return; } wlock(b); b->flag &= ~(Bqueued|Bdirty|Bfinal); wunlock(b); } void fillsuper(Blk *b) { char *p; assert(b->type == Tsuper); p = b->data; memcpy(p + 0, "gefs0001", 8); PBIT32(p + 8, 0); /* dirty */ PBIT32(p + 12, Blksz); PBIT32(p + 16, Bufspc); PBIT32(p + 20, Hdrsz); PBIT32(p + 24, fs->height); PBIT64(p + 32, fs->rootb); PBIT64(p + 40, fs->rooth); PBIT32(p + 48, fs->narena); PBIT64(p + 56, fs->arenasz); PBIT64(p + 64, fs->gen); PBIT64(p + 72, fs->nextqid); } void finalize(Blk *b) { vlong h; // assert((b->flag & Bfinal) == 0); b->flag |= Bfinal; PBIT16(b->buf, b->type); switch(b->type){ default: case Tnone: abort(); break; case Tpivot: PBIT16(b->buf+2, b->nval); PBIT16(b->buf+4, b->valsz); PBIT16(b->buf+6, b->nbuf); PBIT16(b->buf+8, b->bufsz); break; case Tleaf: PBIT16(b->buf+2, b->nval); PBIT16(b->buf+4, b->valsz); break; case Tlog: h = siphash(b->data + 8, Blkspc-8); PBIT64(b->data, h); case Tsuper: case Tarena: case Traw: break; } } Blk* getblk(vlong bp, uvlong bh) { Blk *b; if((b = lookupblk(bp)) == nil){ if((b = readblk(bp, 0)) == nil) return nil; if(siphash(b->buf, Blksz) != bh){ werrstr("corrupt block %llx", bp); return nil; } } return cacheblk(b); } Blk* dupblk(vlong bp, uvlong bh) { USED(bp, bh); return nil; } Blk* pinblk(Blk *b) { ainc(&b->ref); return b; } int refblk(Blk *b) { Arena *a; int r; a = getarena(b->off); lock(a); r = logalloc(a, b->off, LgRef1); unlock(a); return r; } int unrefblk(Blk *b) { Arena *a; int r; a = getarena(b->off); lock(a); r = logalloc(a, b->off, LgUnref1); unlock(a); return r; } ushort blkfill(Blk *b) { switch(b->type){ case Tpivot: return 2*b->nbuf + b->bufsz + 2*b->nval + b->valsz; case Tleaf: return 2*b->nval + b->valsz; default: fprint(2, "invalid block @%lld\n", b->off); abort(); } return 0; // shut up kencc } void putblk(Blk *b) { if(b == nil) return; if((b->flag & (Bdirty|Bqueued)) == Bdirty) enqueue(b); if(adec(&b->ref) == 0){ cachedel(b); free(b); } } void freeblk(Blk *b) { Arena *a; assert(b->ref == 1 && b->flag & (Bdirty|Bqueued) == Bdirty); a = getarena(b->off); lock(a); /* * TODO: what to do if we fail to log a free here?? * This is already an error path! */ logalloc(a, b->off, LgRef1); unlock(a); free(b); } int sync(void) { int i, r; Blk *b; dprint("syncing\n"); r = 0; for(i = 0; i < fs->narena; i++) if(synclog(fs->arenas[i].logtl, -1, -1) == -1) r = -1; for(b = fs->chead; b != nil; b = b->cnext){ // dprint("sync %p\n", b); if(!(b->flag & Bdirty)) continue; if(pwrite(fs->fd, b->buf, Blksz, b->off) == -1) r = -1; } return r; }