shithub: git9

Download patch

ref: 7f781ad522b14fbe48d8b4991c65eea831f89b08
parent: b18f03aaf704b2788c60bd6a3f3ac82db85f4ec3
author: Ori Bernstein <[email protected]>
date: Sat Sep 5 16:50:45 EDT 2020

deltification: use hash table for rolling hashes

Why were we doing a binary search? Slower *and* more
code.

--- a/delta.c
+++ b/delta.c
@@ -5,9 +5,7 @@
 
 typedef struct Dblock	Dblock;
 typedef struct Delta	Delta;
-typedef struct Dset	Dset;
-typedef struct Objmeta	Objmeta;
-typedef struct Deltaset	Deltaset;
+typedef struct Dtab	Dtab;
 
 enum {
 	Blksz	= 127,
@@ -16,29 +14,19 @@
 };
 
 struct Dblock {
-	uchar	*p;
-	uchar	*s;
-	uchar	*e;
+	uchar	*buf;
+	int	len;
+	int	off;
 	u64int	rhash;
 	Hash	chash;
 };
 
-struct Objmeta {
-	Object	*obj;
-	char	*name;
-	vlong	time;
-	Delta	*delta;
-	int	ndelta;
+struct Dtab {
+	Dblock	*b;
+	int	nb;
+	int	sz;
 };
 
-struct Deltaset {
-	Objset	skip;
-	Objset	send;
-	Objmeta	*meta;
-	int	nmeta;
-	int	metasz;
-};
-
 static u64int
 addh(u64int h, uchar v)
 {
@@ -45,44 +33,46 @@
 	return h + v;
 }
 
-static int
-blkcmp(void *pa, void *pb)
+static void
+addblk(Dtab *dt, void *buf, int len, int off, u64int rh)
 {
-	Dblock *a, *b;
+	int i, sz, probe;
+	Dblock *db;
 
-	a = (Dblock*)pa;
-	b = (Dblock*)pb;
-	if(b->rhash == a->rhash)
-		return 0;
-	return (a->rhash > b->rhash) ? -1 : 1;
+	probe = rh % dt->sz;
+	while(dt->b[probe].buf != nil){
+		if(len == dt->b[probe].len && memcmp(buf, dt->b[probe].buf, len) == 0)
+			return;
+		probe = (probe + 1) % dt->sz;
+	}
+	assert(dt->b[probe].buf == nil);
+	dt->b[probe].buf = buf;
+	dt->b[probe].len = len;
+	dt->b[probe].off = off;
+	dt->b[probe].rhash = rh;
+	dt->nb++;
+	sha1(buf, len, dt->b[probe].chash.h, nil);
+	if(dt->sz < 2*dt->nb){
+		sz = dt->sz;
+		db = dt->b;
+		dt->sz *= 2;
+		dt->nb = 0;
+		dt->b = emalloc(dt->sz * sizeof(Dblock));
+		for(i = 0; i < sz; i++)
+			if(db[i].buf != nil)
+				addblk(dt, db[i].buf, db[i].len, db[i].off, db[i].rhash);
+		free(db);
+	}		
 }
 
-static void
-initblk(Dblock *b, uchar *p, uchar *s, uchar *e, u64int rh)
-{
-	b->p = p;
-	b->s = s;
-	b->e = e;
-	b->rhash = rh;
-	sha1(s, e - s, b->chash.h, nil);
-}
-
 static Dblock*
-findrough(Dblock *b, int nb, u64int rh)
+findrough(Dtab *dt, u64int rh)
 {
-	int mid, lo, hi;
+	int probe;
 
-	lo = 0;
-	hi = nb;
-	while(lo <= hi){
-		mid = (lo + hi)/2;
-		if(b[mid].rhash == rh)
-			return &b[mid];
-		else if(b[mid].rhash > rh)
-			lo = mid + 1;
-		else
-			hi = mid - 1;
-	}
+	for(probe = rh % dt->sz; dt->b[probe].buf != nil; probe++)
+		if(dt->b[probe].rhash == rh)
+			return &dt->b[probe];
 	return nil;
 }
 
@@ -107,12 +97,9 @@
 static int
 sameblk(Dblock *b, Hash h, uchar *s, uchar *e)
 {
-	int n;
-
-	n = b->e - b->s;
-	if(n != e - s)
+	if(b->len != e - s)
 		return 0;
-	return hasheq(&b->chash, &h) && memcmp(b->s, s, n) == 0;
+	return hasheq(&b->chash, &h) && memcmp(b->buf, s, b->len) == 0;
 }
 
 static int
@@ -136,10 +123,11 @@
 deltify(void *targ, int ntarg, void *base, int nbase, int *pnd)
 {
 	Hash h;
-	Dblock *b, *k;
+	Dblock *k;
 	Delta *d;
+	Dtab dt;
 	uchar *l, *s, *e, *eb, *bp, *tp;
-	int i, nb, nd;
+	int i, nd, nb;
 	u64int rh;
 
 	bp = base;
@@ -146,14 +134,14 @@
 	tp = targ;
 	s = bp;
 	e = bp;
-	nb = (nbase + Blksz - 1) / Blksz;
-	b = emalloc(nb*sizeof(Dblock));
-	for(i = 0; i < nb; i++){
+	dt.b = emalloc(16*sizeof(Dblock));
+	dt.nb = 0;
+	dt.sz = 16;
+	while(e != bp + nbase){
 		e += nextblk(s, bp + nbase, &rh);
-		initblk(&b[i], bp, s, e, rh);
+		addblk(&dt, s, e - s, s - bp, rh);
 		s = e;
 	}
-	qsort(b, nb, sizeof(*b), blkcmp);
 
 	l = targ;
 	s = targ;
@@ -162,10 +150,11 @@
 	nd = 0;
 	e += nextblk(s, tp + ntarg, &rh);
 	while(1){
-		if((k = findrough(b, nb, rh)) != nil){
+		if((k = findrough(&dt, rh)) != nil){
 			sha1(s, e - s, h.h, nil);
 			if(sameblk(k, h, s, e)){
-				eb = k->e;
+				nb = k->len;
+				eb = k->buf + k->len;
 				/* stretch the block as far as it'll go */
 				for(i = 0; i < (1<<24) - Blksz; i++){
 					if(e == tp + ntarg || eb == bp + nbase)
@@ -172,11 +161,12 @@
 						break;
 					if(*e != *eb)
 						break;
+					nb++;
 					eb++;
 					e++;
 				}
 				emitdelta(&d, &nd, 0, l - tp, s - l);
-				emitdelta(&d, &nd, 1, k->s - k->p, eb - k->s);
+				emitdelta(&d, &nd, 1, k->off, nb);
 				s = e;
 				l = e;
 				e += nextblk(s, tp + ntarg, &rh);
@@ -192,6 +182,6 @@
 	}
 	emitdelta(&d, &nd, 0, l - tp, tp + ntarg - l);
 	*pnd = nd;
-	free(b);
+	free(dt.b);
 	return d;
 }