shithub: git9

Download patch

ref: 3c7125f599580538d08a1380b80ebd1f2323a171
parent: 6794de8f796f03ac666c80cec116c60e33206ff4
author: Ori Bernstein <[email protected]>
date: Sat Dec 12 23:23:59 EST 2020

packfiles: redo deltification

Our deltification sucked a lot. It was very slow, and in
addition it didn't deltify much. The new implementation
is based around some ideas from FastCDC, and it actually
works reasonably well.

In addition, deltas between different object types are
disabled, and commits aren't deltified because it makes
many operations slow.

--- a/delta.c
+++ b/delta.c
@@ -4,22 +4,62 @@
 #include "git.h"
 
 enum {
-	K	= 3,
-	Bconst	= 42,
-	Bmask	= 0x7f,
-	Bshift	= 7,
-	Hshift	= 113,
-	Hlast	= 1692137473L,
+	Minchunk	= 128,
+	Maxchunk	= 8192,
+	Splitmask	= (1<<8)-1,
+	
 };
 
+static u32int geartab[] = {
+    0x67ed26b7, 0x32da500c, 0x53d0fee0, 0xce387dc7, 0xcd406d90, 0x2e83a4d4, 0x9fc9a38d, 0xb67259dc,
+    0xca6b1722, 0x6d2ea08c, 0x235cea2e, 0x3149bb5f, 0x1beda787, 0x2a6b77d5, 0x2f22d9ac, 0x91fc0544,
+    0xe413acfa, 0x5a30ff7a, 0xad6fdde0, 0x444fd0f5, 0x7ad87864, 0x58c5ff05, 0x8d2ec336, 0x2371f853,
+    0x550f8572, 0x6aa448dd, 0x7c9ddbcf, 0x95221e14, 0x2a82ec33, 0xcbec5a78, 0xc6795a0d, 0x243995b7,
+    0x1c909a2f, 0x4fded51c, 0x635d334b, 0x0e2b9999, 0x2702968d, 0x856de1d5, 0x3325d60e, 0xeb6a7502,
+    0xec2a9844, 0x0905835a, 0xa1820375, 0xa4be5cab, 0x96a6c058, 0x2c2ccd70, 0xba40fce3, 0xd794c46b,
+    0x8fbae83e, 0xc3aa7899, 0x3d3ff8ed, 0xa0d42b5b, 0x571c0c97, 0xd2811516, 0xf7e7b96c, 0x4fd2fcbd,
+    0xe2fdec94, 0x282cc436, 0x78e8e95c, 0x80a3b613, 0xcfbee20c, 0xd4a32d1c, 0x2a12ff13, 0x6af82936,
+    0xe5630258, 0x8efa6a98, 0x294fb2d1, 0xdeb57086, 0x5f0fddb3, 0xeceda7ce, 0x4c87305f, 0x3a6d3307,
+    0xe22d2942, 0x9d060217, 0x1e42ed02, 0xb6f63b52, 0x4367f39f, 0x055cf262, 0x03a461b2, 0x5ef9e382,
+    0x386bc03a, 0x2a1e79c7, 0xf1a0058b, 0xd4d2dea9, 0x56baf37d, 0x5daff6cc, 0xf03a951d, 0xaef7de45,
+    0xa8f4581e, 0x3960b555, 0xffbfff6d, 0xbe702a23, 0x8f5b6d6f, 0x061739fb, 0x98696f47, 0x3fd596d4,
+    0x151eac6b, 0xa9fcc4f5, 0x69181a12, 0x3ac5a107, 0xb5198fe7, 0x96bcb1da, 0x1b5ddf8e, 0xc757d650,
+    0x65865c3a, 0x8fc0a41a, 0x87435536, 0x99eda6f2, 0x41874794, 0x29cff4e8, 0xb70efd9a, 0x3103f6e7,
+    0x84d2453b, 0x15a450bd, 0x74f49af1, 0x60f664b1, 0xa1c86935, 0xfdafbce1, 0xe36353e3, 0x5d9ba739,
+    0xbc0559ba, 0x708b0054, 0xd41d808c, 0xb2f31723, 0x9027c41f, 0xf136d165, 0xb5374b12, 0x9420a6ac,
+    0x273958b6, 0xe6c2fad0, 0xebdc1f21, 0xfb33af8b, 0xc71c25cd, 0xe9a2d8e5, 0xbeb38a50, 0xbceb7cc2,
+    0x4e4e73f0, 0xcd6c251d, 0xde4c032c, 0x4b04ac30, 0x725b8b21, 0x4eb8c33b, 0x20d07b75, 0x0567aa63,
+    0xb56b2bb7, 0xc1f5fd3a, 0xcafd35ca, 0x470dd4da, 0xfe4f94cd, 0xfb8de424, 0xe8dbcf40, 0xfe50a37a,
+    0x62db5b5d, 0xf32f4ab6, 0x2c4a8a51, 0x18473dc0, 0xfe0cbb6e, 0xfe399efd, 0xdf34ecc9, 0x6ccd5055,
+    0x46097073, 0x139135c2, 0x721c76f6, 0x1c6a94b4, 0x6eee014d, 0x8a508e02, 0x3da538f5, 0x280d394f,
+    0x5248a0c4, 0x3ce94c6c, 0x9a71ad3a, 0x8493dd05, 0xe43f0ab6, 0x18e4ed42, 0x6c5c0e09, 0x42b06ec9,
+    0x8d330343, 0xa45b6f59, 0x2a573c0c, 0xd7fd3de6, 0xeedeab68, 0x5c84dafc, 0xbbd1b1a8, 0xa3ce1ad1,
+    0x85b70bed, 0xb6add07f, 0xa531309c, 0x8f8ab852, 0x564de332, 0xeac9ed0c, 0x73da402c, 0x3ec52761,
+    0x43af2f4d, 0xd6ff45c8, 0x4c367462, 0xd553bd6a, 0x44724855, 0x3b2aa728, 0x56e5eb65, 0xeaf16173,
+    0x33fa42ff, 0xd714bb5d, 0xfbd0a3b9, 0xaf517134, 0x9416c8cd, 0x534cf94f, 0x548947c2, 0x34193569,
+    0x32f4389a, 0xfe7028bc, 0xed73b1ed, 0x9db95770, 0x468e3922, 0x0440c3cd, 0x60059a62, 0x33504562,
+    0x2b229fbd, 0x5174dca5, 0xf7028752, 0xd63c6aa8, 0x31276f38, 0x0646721c, 0xb0191da8, 0xe00e6de0,
+    0x9eac1a6e, 0x9f7628a5, 0xed6c06ea, 0x0bb8af15, 0xf119fb12, 0x38693c1c, 0x732bc0fe, 0x84953275,
+    0xb82ec888, 0x33a4f1b3, 0x3099835e, 0x028a8782, 0x5fdd51d7, 0xc6c717b3, 0xb06caf71, 0x17c8c111,
+    0x61bad754, 0x9fd03061, 0xe09df1af, 0x3bc9eb73, 0x85878413, 0x9889aaf2, 0x3f5a9e46, 0x42c9f01f,
+    0x9984a4f4, 0xd5de43cc, 0xd294daed, 0xbecba2d2, 0xf1f6e72c, 0x5551128a, 0x83af87e2, 0x6f0342ba,
+};
+
+static u64int
+hash(void *p, int n)
+{
+	uchar buf[SHA1dlen];
+	sha1((uchar*)p, n, buf, nil);
+	return GETBE64(buf);
+}
+
 static void
-addblk(Dtab *dt, void *buf, int len, int off, u64int rh)
+addblk(Dtab *dt, void *buf, int len, int off, u64int h)
 {
 	int i, sz, probe;
 	Dblock *db;
 
-	rh >>= Bshift;
-	probe = rh % dt->sz;
+	probe = h % dt->sz;
 	while(dt->b[probe].buf != nil){
 		if(len == dt->b[probe].len && memcmp(buf, dt->b[probe].buf, len) == 0)
 			return;
@@ -29,7 +69,7 @@
 	dt->b[probe].buf = buf;
 	dt->b[probe].len = len;
 	dt->b[probe].off = off;
-	dt->b[probe].rhash = rh;
+	dt->b[probe].hash = h;
 	dt->nb++;
 	if(dt->sz < 2*dt->nb){
 		sz = dt->sz;
@@ -39,58 +79,85 @@
 		dt->b = eamalloc(dt->sz, sizeof(Dblock));
 		for(i = 0; i < sz; i++)
 			if(db[i].buf != nil)
-				addblk(dt, db[i].buf, db[i].len, db[i].off, db[i].rhash);
+				addblk(dt, db[i].buf, db[i].len, db[i].off, db[i].hash);
 		free(db);
 	}		
 }
 
 static Dblock*
-findrough(Dtab *dt, u64int rh)
+lookup(Dtab *dt, uchar *p, int n)
 {
 	int probe;
+	u64int h;
 
-	rh >>= Bshift;
-	for(probe = rh % dt->sz; dt->b[probe].buf != nil; probe = (probe + 1) % dt->sz)
-		if(dt->b[probe].rhash == rh)
-			return &dt->b[probe];
+	h = hash(p, n);
+	for(probe = h % dt->sz; dt->b[probe].buf != nil; probe = (probe + 1) % dt->sz){
+		if(dt->b[probe].hash != h)
+			continue;
+		if(n != dt->b[probe].len)
+			continue;
+		if(memcmp(p, dt->b[probe].buf, n) != 0)
+			continue;
+		return &dt->b[probe];
+	}
 	return nil;
 }
 
 static int
-nextblk(uchar *s, uchar *e, u64int *ph)
+nextblk(uchar *s, uchar *e)
 {
-	u64int rh;
+	u32int gh;
 	uchar *p;
-	int i;
 
-	p = s;
-	rh = 0;
-	for(i = 0; (rh & Bmask) != Bconst; i++){
-		if(p == e)
+	if((e - s) < Minchunk)
+		return e - s;
+	p = s + Minchunk;
+	if((e - s) > Maxchunk)
+		e = s + Maxchunk;
+	gh = 0;
+	while(p != e){
+		gh = (gh<<1) + geartab[*p++];
+		if((gh & Splitmask) == 0)
 			break;
-		/* FIXME: better hash */
-		rh *= Hshift;
-		rh += *p++ + K;
 	}
-	*ph = rh;
 	return p - s;
 }
 
-static int
-sameblk(Dblock *b, uchar *s, uchar *e)
+void
+dtinit(Dtab *dt, void *base, int nbase)
 {
-	if(b->len != e - s)
-		return 0;
-	return memcmp(b->buf, s, b->len) == 0;
+	uchar *s, *e;
+	u64int h;
+	vlong n, o;
+	
+	o = 0;
+	s = base;
+	e = s + nbase;
+	dt->nb = 0;
+	dt->sz = 128;
+	dt->b = eamalloc(dt->sz, sizeof(Dblock));
+	dt->base = base;
+	dt->nbase = nbase;
+	while(s != e){
+		n = nextblk(s, e);
+		h = hash(s, n);
+		addblk(dt, s, n, o, h);
+		s += n;
+		o += n;
+	}
 }
 
+void
+dtclear(Dtab *dt)
+{
+	free(dt->b);
+}
+
 static int
 emitdelta(Delta **pd, int *nd, int cpy, int off, int len)
 {
 	Delta *d;
 
-	if(len == 0)
-		return 0;
 	*nd += 1;
 	*pd = erealloc(*pd, *nd * sizeof(Delta));
 	d = &(*pd)[*nd - 1];
@@ -100,81 +167,52 @@
 	return len;
 }
 
-void
-dtinit(Dtab *dt, void *base, int nbase)
+static int
+stretch(Dtab *dt, Dblock *b, uchar *s, uchar *e, int n)
 {
-	uchar *bp, *s, *e;
-	u64int rh;
-	
-	bp = base;
-	s = bp;
-	e = bp;
-	rh = 0;
-	dt->nb = 0;
-	dt->sz = 128;
-	dt->b = eamalloc(dt->sz, sizeof(Dblock));
-	while(e != bp + nbase){
-		e += nextblk(s, bp + nbase, &rh);
-		addblk(dt, s, e - s, s - bp, rh);
-		s = e;
+	uchar *p, *q, *eb;
+
+	if(b == nil)
+		return n;
+	p = s + n;
+	q = dt->base + b->off + n;
+	eb = dt->base + dt->nbase;
+	while(n < (1<<24)){
+		if(p == e || q == eb)
+			break;
+		if(*p != *q)
+			break;
+		p++;
+		q++;
+		n++;
 	}
+	return n;
 }
 
-void
-dtclear(Dtab *dt)
-{
-	free(dt->b);
-}
-
 Delta*
 deltify(void *targ, int ntarg, Dtab *dt, int *pnd)
 {
-	Dblock *k;
 	Delta *d;
-	uchar *l, *s, *e, *eb, *tp;
-	int i, nd, nb;
-	u64int rh;
-
-
-	tp = targ;
-	l = targ;
-	s = targ;
-	e = targ;
+	Dblock *b;
+	uchar *s, *e;
+	vlong n, o;
+	
+	o = 0;
 	d = nil;
-	nd = 0;
-	rh = 0;
-	e += nextblk(s, tp + ntarg, &rh);
-	while(1){
-		if((rh & Bmask) == Bconst && (k = findrough(dt, rh)) != nil){
-			if(sameblk(k, s, e)){
-				nb = k->len;
-				eb = k->buf + k->len;
-				/* stretch the block: 1<<24 is the max packfiles support. */
-				for(i = 0; i < (1<<24) - nb; i++){
-					if(e == tp + ntarg || eb == dt->base + dt->nbase)
-						break;
-					if(*e != *eb)
-						break;
-					nb++;
-					eb++;
-					e++;
-				}
-				emitdelta(&d, &nd, 0, l - tp, s - l);
-				emitdelta(&d, &nd, 1, k->off, nb);
-				s = e;
-				l = e;
-				e += nextblk(s, tp + ntarg, &rh);
-				continue;
-			}
-		}
-		if(e == tp + ntarg)
-			break;
-		/* got a better hash? apply within! */
-		rh -= *s++ * Hlast;
-		rh *= Hshift;
-		rh += *e++ + K;
+	s = targ;
+	e = s + ntarg;
+	*pnd = 0;
+	while(s != e){
+		n = nextblk(s, e);
+		b = lookup(dt, s, n);
+		n = stretch(dt, b, s, e, n);
+		if(b != nil)
+			emitdelta(&d, pnd, 1, b->off, n);
+		else
+			emitdelta(&d, pnd, 0, o, n);
+		s += n;
+		o += n;
 	}
-	emitdelta(&d, &nd, 0, l - tp, tp + ntarg - l);
-	*pnd = nd;
+	assert(o == ntarg);
 	return d;
 }
--- a/git.h
+++ b/git.h
@@ -162,7 +162,7 @@
 	uchar	*buf;
 	int	len;
 	int	off;
-	u64int	rhash;
+	u64int	hash;
 };
 
 struct Delta {
--- a/pack.c
+++ b/pack.c
@@ -7,6 +7,7 @@
 typedef struct Buf	Buf;
 typedef struct Objmeta	Objmeta;
 typedef struct Compout	Compout;
+typedef struct Packf	Packf;
 
 #define max(x, y) ((x) > (y) ? (x) : (y))
 
@@ -42,6 +43,13 @@
 	char *data;
 };
 
+struct Packf {
+	char	*path;
+	int	refs;
+	int	idxfd;
+	int	packfd;
+};
+
 static int	readpacked(Biobuf *, Object *, int);
 static Object	*readidxobject(Biobuf *, Hash, int);
 
@@ -50,6 +58,8 @@
 Object *lrutail;
 int	ncache;
 int	cachemax = 1024;
+Packf	*packf;
+int	npackf;
 
 static void
 clear(Object *o)
@@ -150,7 +160,81 @@
 	}		
 }
 
+static Packf*
+findpackf(char *path)
+{
+	int np, pfd, ifd;
+	char buf[192];
+	Packf *pf;
 
+	np = strlen(path);
+	if(np > 4 && strcmp(path + np - 4, ".idx") == 0)
+		np -= 4;
+	else if(np > 5 && strcmp(path + np - 5, ".pack") == 0)
+		np -= 5;
+	else{
+		sysfatal("invalid pack path %s", path);
+		return nil;
+	}
+		
+	for(pf = &packf[0]; pf != &packf[npackf]; pf++)
+		if(strncmp(pf->path, path, np) == 0 && strlen(pf->path) == np)
+			return pf;
+	snprint(buf, sizeof(buf), ".git/objects/pack/%.*s.idx", np, path);
+	ifd = open(buf, OREAD);
+	snprint(buf, sizeof(buf), ".git/objects/pack/%.*s.pack", np, path);
+	pfd = open(buf, OREAD);
+	if(ifd == -1 || pfd == -1)
+		goto error;
+	packf = earealloc(packf, ++npackf, sizeof(Packf));
+	pf = &packf[npackf-1];
+	pf->path = smprint("%.*s", np, path);
+	pf->idxfd = ifd;
+	pf->packfd = pfd;
+	return pf;
+
+error:
+	print("pack open error for %s: %r\n", path);
+	if(ifd != -1) close(ifd);
+	if(pfd != -1) close(pfd);
+	return nil;
+}
+
+static Biobuf*
+packopen(char *path)
+{
+	Biobuf *bfd;
+	Packf *pf;
+
+	if((pf = findpackf(path)) == nil)
+		return nil;
+	bfd = emalloc(sizeof(Biobuf));
+	Binit(bfd, pf->packfd, OREAD);
+	return bfd;
+
+	
+}
+
+static Biobuf*
+idxopen(char *path)
+{
+	Biobuf *bfd;
+	Packf *pf;
+
+	if((pf = findpackf(path)) == nil)
+		return nil;
+	bfd = emalloc(sizeof(Biobuf));
+	Binit(bfd, pf->idxfd, OREAD);
+	return bfd;
+}
+
+static void
+pfclose(Biobuf *f)
+{
+	Bterm(f);
+	free(f);
+}
+
 static u32int
 crc32(u32int crc, char *b, int nb)
 {
@@ -888,29 +972,23 @@
 		l = strlen(d[i].name);
 		if(l > 4 && strcmp(d[i].name + l - 4, ".idx") != 0)
 			continue;
-		snprint(path, sizeof(path), ".git/objects/pack/%s", d[i].name);
-		if((f = Bopen(path, OREAD)) == nil)
+		if((f = idxopen(d[i].name)) == nil)
 			continue;
 		o = searchindex(f, h);
-		Bterm(f);
-		if(o == -1)
-			continue;
-		break;
+		pfclose(f);
+		if(o != -1)
+			break;
 	}
-
 	if (o == -1)
 		goto error;
 
-	if((n = snprint(path, sizeof(path), "%s", path)) >= sizeof(path) - 4)
+	if((f = packopen(d[i].name)) == nil)
 		goto error;
-	memcpy(path + n - 4, ".pack", 6);
-	if((f = Bopen(path, OREAD)) == nil)
-		goto error;
 	if(Bseek(f, o, 0) == -1)
 		goto errorf;
 	if(readpacked(f, obj, flag) == -1)
 		goto errorf;
-	Bterm(f);
+	pfclose(f);
 	parseobject(obj);
 	free(d);
 	cache(obj);
@@ -1069,7 +1147,8 @@
 		}
 		nvalid = n;
 	}
-	fprint(2, "\b\b\b\b100%%\n");
+	if(interactive)
+		fprint(2, "\b\b\b\b100%%\n");
 	Bterm(f);
 
 	st = nil;
@@ -1294,19 +1373,21 @@
 	for(i = 0; i < nmeta; i++){
 		m = meta[i];
 		pct = showprogress((i*100) / nmeta, pct);
-		if((a = readobject(m->obj->hash)) == nil)
-			sysfatal("readobject %H: %r", m->obj->hash);
-		best = a->size;
 		m->base = nil;
 		m->delta = nil;
 		m->ndelta = 0;
+		if(m->obj->type == GCommit || m->obj->type == GTag)
+			continue;
+		if((a = readobject(m->obj->hash)) == nil)
+			sysfatal("readobject %H: %r", m->obj->hash);
+		best = a->size;
 		dtinit(&m->dtab, a->data, a->size);
 		if(i >= 11)
 			dtclear(&meta[i-11]->dtab);
 		for(j = max(0, i - 10); j < i; j++){
 			p = meta[j];
-			/* lets not make the chains too long */
-			if(p->nchain >= 4095)
+			/* long chains make unpacking slow */
+			if(p->nchain >= 128 || p->obj->type != a->type)
 				continue;
 			if((b = readobject(p->obj->hash)) == nil)
 				sysfatal("readobject %H: %r", p->obj->hash);
--- a/util.c
+++ b/util.c
@@ -317,7 +317,7 @@
 		return 0;
 	if(x > pct){
 		pct = x;
-		fprint(1, "\b\b\b\b%3d%%", pct);
+		fprint(2, "\b\b\b\b%3d%%", pct);
 	}
 	return pct;
 }