shithub: pdffs

Download patch

ref: 7dd87721538f0392ba840134ecf3e7468b5d4a3f
parent: b051539715907b60f5ccb8570f4665ad45b3fbaf
author: Noam Preil <[email protected]>
date: Tue Jun 1 14:57:54 EDT 2021

add preliminary heuristic-based text generation

--- a/array.c
+++ b/array.c
@@ -10,9 +10,9 @@
 	Object *o, *m;
 	int c, noel;
 
-	o = calloc(1, sizeof(*o));
-	o->pdf = pdf;
-	o->type = Oarray;
+	o = arraynew(pdf);
+	if(o == nil)
+		return nil;
 	Sgetc(s); /* throw away '[' */
 
 	for(noel = 0;;){
@@ -45,6 +45,18 @@
 	werrstr("array: %r");
 	pdfobjfree(o);
 	return nil;
+}
+
+Object *
+arraynew(Pdf *pdf) /* allocate an empty Oarray object that records pdf as its document; nil if calloc fails */
+{
+	Object *o;
+	o = calloc(1, sizeof(*o)); /* zero-filled, so every field other than the two set below starts as 0 */
+	if(o == nil)
+		return nil;
+	o->pdf = pdf;
+	o->type = Oarray;
+	return o;
 }
 
 int
--- a/main.c
+++ b/main.c
@@ -15,6 +15,38 @@
 	threadexitsall("usage");
 }
 
+static void
+dumppage(Object *page) /* render a single page object using a Page that lives only for this call */
+{
+	Page p;
+	pageinit(&p);
+	if(pagerender(&p, page)) /* NOTE(review): pagerender as added in this patch always returns 0, so the line below is never reached and p.text stays nil - confirm the intended return convention */
+		fprint(1, "%s\n", p.text);
+	pagefree(&p);
+}
+
+static void
+dumppages(Object *pages) /* walk the "Kids" of a page-tree node in order, recursing into Pages nodes and dumping Page nodes */
+{
+	Object *page, *kids, *type;
+	int i, count;
+	kids = dictget(pages, "Kids");
+	count = arraylen(kids);
+	for(i = 0; i < count; i += 1){
+		page = arrayget(kids, i);
+		// Each kid is expected to be a dict whose "Type" is either Page or Pages
+		type = dictget(page, "Type");
+		// "Type" is expected to be a name; type->name is read below without checking its type
+		if(strcmp(type->name, "Pages") == 0)
+			dumppages(page); /* interior node: descend */
+		else if(strcmp(type->name, "Page") == 0)
+			dumppage(page); /* leaf: render it */
+		else
+			sysfatal("Unexpected page node type '%s'", type->name);
+	}
+}
+
+
 void
 threadmain(int argc, char **argv)
 {
@@ -49,6 +81,12 @@
 				sysfatal("write failed");
 			Sclose(s);
 			v = nil;
+			break;
+		}else if(argv[i][0] == '"' && argv[i][1] == 0 && v->type == Odict && strcmp(dictget(v, "Type")->name, "Page") == 0){
+			dumppage(v);
+			break;
+		}else if(argv[i][0] == '"' && argv[i][1] == 0 && v->type == Odict && strcmp(dictget(v, "Type")->name, "Pages") == 0){
+			dumppages(v);
 			break;
 		}else if(argv[i][0] == '*' && argv[i][1] == 0 && v->type == Odict){
 			for(k = 0; k < v->dict.nkv; k++)
--- a/op.c
+++ b/op.c
@@ -14,379 +14,395 @@
 
 struct Op {
 	char *s;
-	int (*f)(Op *op, Object *s);
+	int (*f)(Op *op, Page *p);
 	int argc;
 	int flags;
 };
 
 static int
-cobegin(Op *op, Object *s)
+cobegin(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-coend(Op *op, Object *s)
+coend(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-gspush(Op *op, Object *s)
+gspush(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-gspop(Op *op, Object *s)
+gspop(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-gsctm(Op *op, Object *s)
+gsctm(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-gswidth(Op *op, Object *s)
+gswidth(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-gscap(Op *op, Object *s)
+gscap(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-gsjoin(Op *op, Object *s)
+gsjoin(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-gsmiterlim(Op *op, Object *s)
+gsmiterlim(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-gsdash(Op *op, Object *s)
+gsdash(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-gsintent(Op *op, Object *s)
+gsintent(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-gsflatness(Op *op, Object *s)
+gsflatness(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-gsstate(Op *op, Object *s)
+gsstate(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-pcmove(Op *op, Object *s)
+pcmove(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-pcline(Op *op, Object *s)
+pcline(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-pccurve(Op *op, Object *s)
+pccurve(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-pcsubpath(Op *op, Object *s)
+pcsubpath(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-pcrect(Op *op, Object *s)
+pcrect(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-ppstroke(Op *op, Object *s)
+ppstroke(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-ppstrokec(Op *op, Object *s)
+ppstrokec(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-ppfill(Op *op, Object *s)
+ppfill(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-ppfills(Op *op, Object *s)
+ppfills(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-ppfillcfs(Op *op, Object *s)
+ppfillcfs(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-ppc(Op *op, Object *s)
+ppc(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-cpclip(Op *op, Object *s)
+cpclip(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-cspace(Op *op, Object *s)
+cspace(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-ccolour(Op *op, Object *s)
+ccolour(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-ccolour2(Op *op, Object *s)
+ccolour2(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-cgray(Op *op, Object *s)
+cgray(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-crgb(Op *op, Object *s)
+crgb(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-ccmyk(Op *op, Object *s)
+ccmyk(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-sshade(Op *op, Object *s)
+sshade(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-eoobject(Op *op, Object *s)
+eoobject(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-iibegin(Op *op, Object *s)
+iibegin(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-iidata(Op *op, Object *s)
+iidata(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-iiend(Op *op, Object *s)
+iiend(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-tsspace(Op *op, Object *s)
+tsspace(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-tswspace(Op *op, Object *s)
+tswspace(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-tshscale(Op *op, Object *s)
+tshscale(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-tslead(Op *op, Object *s)
+tslead(Op *op, Page *p) /* heuristic: turn the first operand into blank lines in the text output */
 {
-	USED(op, s);
+	int d = arrayget(p->stack, 0)->num.d / 20; /* one newline per 20 units of the operand, truncated */
+	while(d > 0){
+		d -= 1;
+		fprint(1, "\n"); /* fd 1, where thshowarr writes the text, so redirected output keeps its line breaks */
+	}
+	USED(op, p);
 	return 0;
 }
 
 static int
-tsfontsz(Op *op, Object *s)
+tsfontsz(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-tsrendmode(Op *op, Object *s)
+tsrendmode(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-tsrise(Op *op, Object *s)
+tsrise(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-tobegin(Op *op, Object *s)
+tobegin(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-toend(Op *op, Object *s)
+toend(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-tpmove(Op *op, Object *s)
+tpmove(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-tpmatrix(Op *op, Object *s)
+tpmatrix(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-tpmove0(Op *op, Object *s)
+tpmove0(Op *op, Page *p) /* starts a new line in the text output; always returns 0 */
 {
-	USED(op, s);
+	USED(op, p);
+	fprint(1, "\n"); /* fd 1, where thshowarr writes the text, so redirected output keeps its line breaks */
 	return 0;
 }
 
 static int
-thshow(Op *op, Object *s)
+thshow(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-thshowarr(Op *op, Object *s)
+thshowarr(Op *op, Page *p) /* write the strings of the array operand to fd 1, in order; always returns 0 */
 {
-	USED(op, s);
+	Object *arr = arrayget(p->stack, 0);
+	Object *o;
+	int i;
+	for(i = 0; i < arraylen(arr); i += 1){
+		o = arrayget(arr, i);
+		if(o->type == Ostr)
+			fprint(1, "%s", o->str);
+		else if(o->type == Onum && o->num.d < -14) /* heuristic: a number below -14 becomes a space; num.d is only read from numbers */
+			fprint(1, " ");
+	}
+	USED(op);
 	return 0;
 }
 
 static int
-t3width(Op *op, Object *s)
+t3width(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-t3widthbb(Op *op, Object *s)
+t3widthbb(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-t4add(Op *op, Object *s)
+t4add(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 /*
 	double x;
 	x = objat(s+1, Onum)->num.d + objat(s+0, Onum)->num.d;
@@ -398,9 +414,9 @@
 }
 
 static int
-t4sub(Op *op, Object *s)
+t4sub(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 /*
 	double x;
 	x = objat(s+1, Onum)->num.d - objat(s+0, Onum)->num.d;
@@ -412,9 +428,9 @@
 }
 
 static int
-t4mul(Op *op, Object *s)
+t4mul(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 /*
 	double x;
 
@@ -427,9 +443,9 @@
 }
 
 static int
-t4div(Op *op, Object *s)
+t4div(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 /*
 	double x;
 
@@ -442,268 +458,268 @@
 }
 
 static int
-t4idiv(Op *op, Object *s)
+t4idiv(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-t4mod(Op *op, Object *s)
+t4mod(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-t4neg(Op *op, Object *s)
+t4neg(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-t4abs(Op *op, Object *s)
+t4abs(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-t4ceiling(Op *op, Object *s)
+t4ceiling(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-t4floor(Op *op, Object *s)
+t4floor(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-t4round(Op *op, Object *s)
+t4round(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-t4truncate(Op *op, Object *s)
+t4truncate(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-t4sqrt(Op *op, Object *s)
+t4sqrt(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-t4sin(Op *op, Object *s)
+t4sin(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-t4cos(Op *op, Object *s)
+t4cos(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-t4atan(Op *op, Object *s)
+t4atan(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-t4exp(Op *op, Object *s)
+t4exp(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-t4ln(Op *op, Object *s)
+t4ln(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-t4log(Op *op, Object *s)
+t4log(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-t4cvi(Op *op, Object *s)
+t4cvi(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-t4cvr(Op *op, Object *s)
+t4cvr(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-t4eq(Op *op, Object *s)
+t4eq(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-t4ne(Op *op, Object *s)
+t4ne(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-t4gt(Op *op, Object *s)
+t4gt(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-t4ge(Op *op, Object *s)
+t4ge(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-t4lt(Op *op, Object *s)
+t4lt(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-t4le(Op *op, Object *s)
+t4le(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-t4and(Op *op, Object *s)
+t4and(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-t4or(Op *op, Object *s)
+t4or(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-t4xor(Op *op, Object *s)
+t4xor(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-t4not(Op *op, Object *s)
+t4not(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-t4bitshift(Op *op, Object *s)
+t4bitshift(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-t4true(Op *op, Object *s)
+t4true(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-t4false(Op *op, Object *s)
+t4false(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-t4if(Op *op, Object *s)
+t4if(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-t4ifelse(Op *op, Object *s)
+t4ifelse(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-t4pop(Op *op, Object *s)
+t4pop(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-t4exch(Op *op, Object *s)
+t4exch(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-t4dup(Op *op, Object *s)
+t4dup(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-t4copy(Op *op, Object *s)
+t4copy(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-t4index(Op *op, Object *s)
+t4index(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
 static int
-t4roll(Op *op, Object *s)
+t4roll(Op *op, Page *p)
 {
-	USED(op, s);
+	USED(op, p);
 	return 0;
 }
 
@@ -856,18 +872,135 @@
 	{nil, nil, 0},
 };
 
+// If an operator is found at the current position in the stream, the associated
+// Op is returned and the stream is advanced past it. Otherwise, nil is returned
+// and the stream is left unchanged. An operator only matches when it is followed
+// by whitespace, a delimiter, or the end of the buffered data, so a short
+// operator never matches the front of a longer token.
 Op *
-opfind(char *name)
+opfind(Stream *s)
 {
-	int i;
+	int i, len, avail;
 	Op *op;
-	i = 0;
-	op = &ops[0];
-	while(op->s != nil){
-		if(strcmp(op->s, name) == 0)
-			return op;
-		i += 1;
-		op = &ops[i];
+	char *b;
+
+	b = (char*)s->buf.b + s->buf.off;
+	// Bytes left in the buffer; nothing at or beyond b[avail] may be read.
+	avail = s->buf.sz - s->buf.off;
+	for(i = 0; ops[i].s != nil; i += 1){
+		op = &ops[i];
+		len = strlen(op->s);
+		if(len > avail || strncmp(op->s, b, len) != 0)
+			continue;
+		// Running into the end of the buffer terminates the token as well.
+		if(len < avail && !isws(b[len]) && !isdelim(b[len]))
+			continue;
+		s->buf.off += len;
+		return op;
 	}
 	return nil;
+}
+
+// Puts a Page into its empty state: no text and no operand stack.
+void
+pageinit(Page *page)
+{
+	page->text = nil;
+	// Stack is per-content-stream, so we don't create it here
+	page->stack = nil;
+}
+
+// Releases what the Page owns and returns it to the state pageinit leaves it
+// in, so calling pagefree again on the same Page is harmless.
+void
+pagefree(Page *p)
+{
+	free(p->text);
+	p->text = nil;
+	// The stack is still nil if no content stream was ever rendered.
+	if(p->stack != nil)
+		pdfobjfree(p->stack);
+	p->stack = nil;
+}
+
+// Frees every operand on the stack and its element storage, leaving an empty
+// array that can be pushed to again.
+static void
+stackreset(Object *stack)
+{
+	int i;
+	for(i = 0; i < stack->array.ne; i += 1)
+		pdfobjfree(stack->array.e[i]);
+	stack->array.ne = 0;
+	free(stack->array.e);
+	stack->array.e = nil;
+}
+
+// Interprets a single content stream: operands are pushed onto p->stack until
+// an operator is found, the operator's handler is run, and the stack is then
+// cleared for the next operator. Terminates the program if the stream cannot
+// be opened; stops early (after a diagnostic on fd 2) if an operand cannot be
+// read or pushed. A newline is written to fd 1 once the stream is done.
+static void
+pagerendercontent(Page *p, Object *content)
+{
+	Stream *s;
+	Object *o;
+	Op *op;
+
+	s = Sopen(content);
+	if(s == nil){
+		fprint(2, "%O\n", content);
+		sysfatal("%r");
+	}
+	// pagerender calls this once per stream of a page; release the stack left
+	// behind by the previous stream instead of leaking it.
+	if(p->stack != nil)
+		pdfobjfree(p->stack);
+	p->stack = arraynew(content->pdf);
+	if(p->stack == nil){
+		Sclose(s);
+		return;
+	}
+	while(s->buf.off != s->buf.sz){
+		// Check the bound before reading the byte it guards.
+		while(s->buf.off != s->buf.sz && isws(s->buf.b[s->buf.off]))
+			s->buf.off += 1;
+		if(s->buf.off == s->buf.sz)
+			break;
+		op = opfind(s);
+		if(op != nil){
+			op->f(op, p);
+			stackreset(p->stack);
+		} else{
+			o = pdfobj(content->pdf, s);
+			if(o == nil){
+				fprint(2, "failed to read operand: %r\n");
+				break;
+			}
+			if(!arrayadd(p->stack, o)){
+				fprint(2, "Failed to push operand to stack: %r\n");
+				break;
+			}
+		}
+	}
+	fprint(1, "\n");
+	Sclose(s);
+}
+
+// Renders the page dictionary o by interpreting its "Contents" entry: each
+// element of an array in turn, or any other non-null object as a single
+// stream. A null entry renders nothing. Always returns 0.
+int
+pagerender(Page *p, Object *o)
+{
+	Object *content;
+	int i;
+	content = dictget(o, "Contents");
+	if(content->type == Oarray)
+		for(i = 0; i < arraylen(content); i += 1)
+			pagerendercontent(p, arrayget(content, i));
+	else if(content->type != Onull)
+		pagerendercontent(p, content);
+	return 0;
 }
--- a/pdf.h
+++ b/pdf.h
@@ -22,7 +22,12 @@
 #pragma incomplete PredictParms;
 typedef struct Stream Stream;
 typedef struct Xref Xref;
+typedef struct Page Page;
 
+struct Page {
+	Object *stack; /* operand array for the content stream being interpreted; nil until a stream is rendered */
+	char *text; /* set to nil by pageinit and released by pagefree; nothing in this patch assigns it */
+};
 
 struct Buffer {
 	uchar *b;
@@ -188,6 +193,7 @@
 int isutf8(char *s, int len);
 
 int arraylen(Object *o);
+Object *arraynew(Pdf *pdf);
 Object *arrayget(Object *o, int i);
 int arrayint(Object *o, int i);
 int arrayadd(Object *a, Object *o);
@@ -232,6 +238,10 @@
 int bufput(Buffer *b, uchar *d, int sz);
 int bufget(Buffer *b, uchar *d, int sz);
 void bufdump(Buffer *b);
+
+void pageinit(Page *p);
+int pagerender(Page *p, Object *o);
+void pagefree(Page *p);
 
 #pragma varargck type "O" Object*
 #pragma varargck type "T" Object*