ref: 7dd87721538f0392ba840134ecf3e7468b5d4a3f
parent: b051539715907b60f5ccb8570f4665ad45b3fbaf
author: Noam Preil <[email protected]>
date: Tue Jun 1 14:57:54 EDT 2021
add preliminary heuristic-based text generation
--- a/array.c
+++ b/array.c
@@ -10,9 +10,9 @@
Object *o, *m;
int c, noel;
- o = calloc(1, sizeof(*o));
- o->pdf = pdf;
- o->type = Oarray;
+ o = arraynew(pdf);
+ if(o == nil)
+ return nil;
Sgetc(s); /* throw away '[' */
for(noel = 0;;){
@@ -45,6 +45,18 @@
werrstr("array: %r");
pdfobjfree(o);
return nil;
+}
+
+/*
+ * Allocate a zero-initialised array object tagged with the given pdf.
+ * Returns nil when the allocation fails.
+ */
+Object *
+arraynew(Pdf *pdf)
+{
+	Object *arr;
+
+	arr = calloc(1, sizeof(*arr));
+	if(arr != nil){
+		arr->pdf = pdf;
+		arr->type = Oarray;
+	}
+	return arr;
}
int
--- a/main.c
+++ b/main.c
@@ -15,6 +15,38 @@
threadexitsall("usage");
}
+/*
+ * Render a single page object and release the render state afterwards.
+ * The accumulated text is written to fd 1 only when pagerender reports
+ * a non-zero result.
+ * NOTE(review): pagerender as added to op.c returns 0 on every path, so
+ * this branch looks unreachable - confirm the intended return convention.
+ */
+static void
+dumppage(Object *page)
+{
+	Page state;
+
+	pageinit(&state);
+	if(pagerender(&state, page) != 0)
+		fprint(1, "%s\n", state.text);
+	pagefree(&state);
+}
+
+/*
+ * Walk one node of the page tree.  Each entry of the node's Kids array
+ * is either another Pages node, which is walked recursively, or a Page
+ * leaf, which is handed to dumppage.  A kid with a missing or unexpected
+ * Type is fatal.
+ */
+static void
+dumppages(Object *pages)
+{
+	Object *page, *kids, *type;
+	int i, count;
+
+	kids = dictget(pages, "Kids");
+	count = arraylen(kids);
+	for(i = 0; i < count; i += 1){
+		page = arrayget(kids, i);
+		/* Must be a dict, either Page or Pages */
+		type = dictget(page, "Type");
+		/*
+		 * pagerender treats an absent key as an Onull object; such an
+		 * object presumably carries no usable name, so stop here
+		 * instead of passing it to strcmp.
+		 */
+		if(type == nil || type->type == Onull)
+			sysfatal("page node %d has no Type", i);
+		/* MUST be a name. */
+		if(strcmp(type->name, "Pages") == 0)
+			dumppages(page);
+		else if(strcmp(type->name, "Page") == 0)
+			dumppage(page);
+		else
+			sysfatal("Unexpected page node type '%s'", type->name);
+	}
+}
+
+
void
threadmain(int argc, char **argv)
{
@@ -49,6 +81,12 @@
sysfatal("write failed");
Sclose(s);
v = nil;
+ break;
+ }else if(argv[i][0] == '"' && argv[i][1] == 0 && v->type == Odict && strcmp(dictget(v, "Type")->name, "Page") == 0){
+ dumppage(v);
+ break;
+ }else if(argv[i][0] == '"' && argv[i][1] == 0 && v->type == Odict && strcmp(dictget(v, "Type")->name, "Pages") == 0){
+ dumppages(v);
break;
}else if(argv[i][0] == '*' && argv[i][1] == 0 && v->type == Odict){
for(k = 0; k < v->dict.nkv; k++)
--- a/op.c
+++ b/op.c
@@ -14,379 +14,395 @@
struct Op {
char *s;
- int (*f)(Op *op, Object *s);
+ int (*f)(Op *op, Page *p);
int argc;
int flags;
};
static int
-cobegin(Op *op, Object *s)
+cobegin(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-coend(Op *op, Object *s)
+coend(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-gspush(Op *op, Object *s)
+gspush(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-gspop(Op *op, Object *s)
+gspop(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-gsctm(Op *op, Object *s)
+gsctm(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-gswidth(Op *op, Object *s)
+gswidth(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-gscap(Op *op, Object *s)
+gscap(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-gsjoin(Op *op, Object *s)
+gsjoin(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-gsmiterlim(Op *op, Object *s)
+gsmiterlim(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-gsdash(Op *op, Object *s)
+gsdash(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-gsintent(Op *op, Object *s)
+gsintent(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-gsflatness(Op *op, Object *s)
+gsflatness(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-gsstate(Op *op, Object *s)
+gsstate(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-pcmove(Op *op, Object *s)
+pcmove(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-pcline(Op *op, Object *s)
+pcline(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-pccurve(Op *op, Object *s)
+pccurve(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-pcsubpath(Op *op, Object *s)
+pcsubpath(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-pcrect(Op *op, Object *s)
+pcrect(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-ppstroke(Op *op, Object *s)
+ppstroke(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-ppstrokec(Op *op, Object *s)
+ppstrokec(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-ppfill(Op *op, Object *s)
+ppfill(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-ppfills(Op *op, Object *s)
+ppfills(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-ppfillcfs(Op *op, Object *s)
+ppfillcfs(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-ppc(Op *op, Object *s)
+ppc(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-cpclip(Op *op, Object *s)
+cpclip(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-cspace(Op *op, Object *s)
+cspace(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-ccolour(Op *op, Object *s)
+ccolour(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-ccolour2(Op *op, Object *s)
+ccolour2(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-cgray(Op *op, Object *s)
+cgray(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-crgb(Op *op, Object *s)
+crgb(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-ccmyk(Op *op, Object *s)
+ccmyk(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-sshade(Op *op, Object *s)
+sshade(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-eoobject(Op *op, Object *s)
+eoobject(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-iibegin(Op *op, Object *s)
+iibegin(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-iidata(Op *op, Object *s)
+iidata(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-iiend(Op *op, Object *s)
+iiend(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-tsspace(Op *op, Object *s)
+tsspace(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-tswspace(Op *op, Object *s)
+tswspace(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-tshscale(Op *op, Object *s)
+tshscale(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-tslead(Op *op, Object *s)
+tslead(Op *op, Page *p)
{
- USED(op, s);
+	/*
+	 * Heuristic line spacing (presumably the text-leading operator):
+	 * emit one line break for every 20 units of the first operand.
+	 * The breaks go to fd 1, the descriptor thshowarr writes the text
+	 * to, so they stay in the same output stream.
+	 */
+	Object *lead;
+	int d;
+
+	USED(op);
+	/* nothing to do unless a numeric operand was actually pushed */
+	if(arraylen(p->stack) < 1)
+		return 0;
+	lead = arrayget(p->stack, 0);
+	if(lead->type != Onum)
+		return 0;
+	for(d = lead->num.d / 20; d > 0; d -= 1)
+		fprint(1, "\n");
return 0;
}
static int
-tsfontsz(Op *op, Object *s)
+tsfontsz(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-tsrendmode(Op *op, Object *s)
+tsrendmode(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-tsrise(Op *op, Object *s)
+tsrise(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-tobegin(Op *op, Object *s)
+tobegin(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-toend(Op *op, Object *s)
+toend(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-tpmove(Op *op, Object *s)
+tpmove(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-tpmatrix(Op *op, Object *s)
+tpmatrix(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-tpmove0(Op *op, Object *s)
+tpmove0(Op *op, Page *p)
{
- USED(op, s);
+	USED(op, p);
+	/*
+	 * Heuristic: this operator starts a new line of text.  The break is
+	 * written to fd 1, the descriptor thshowarr writes the text to, so
+	 * it is not lost when only that descriptor is redirected.
+	 */
+	fprint(1, "\n");
return 0;
}
static int
-thshow(Op *op, Object *s)
+thshow(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-thshowarr(Op *op, Object *s)
+thshowarr(Op *op, Page *p)
{
- USED(op, s);
+	/*
+	 * Print the strings of the array operand to fd 1.  A numeric entry
+	 * below -14 is taken as a word gap and printed as one space (the
+	 * threshold is a heuristic); every other entry is ignored.
+	 */
+	Object *arr, *o;
+	int i, n;
+
+	USED(op);
+	/* the operator needs one operand, and it has to be an array */
+	if(arraylen(p->stack) < 1)
+		return 0;
+	arr = arrayget(p->stack, 0);
+	if(arr->type != Oarray)
+		return 0;
+	n = arraylen(arr);
+	for(i = 0; i < n; i += 1){
+		o = arrayget(arr, i);
+		if(o->type == Ostr)
+			fprint(1, "%s", o->str);
+		/* only read num.d from entries that really are numbers */
+		else if(o->type == Onum && o->num.d < -14)
+			fprint(1, " ");
+	}
return 0;
}
static int
-t3width(Op *op, Object *s)
+t3width(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-t3widthbb(Op *op, Object *s)
+t3widthbb(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-t4add(Op *op, Object *s)
+t4add(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
/*
double x;
x = objat(s+1, Onum)->num.d + objat(s+0, Onum)->num.d;
@@ -398,9 +414,9 @@
}
static int
-t4sub(Op *op, Object *s)
+t4sub(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
/*
double x;
x = objat(s+1, Onum)->num.d - objat(s+0, Onum)->num.d;
@@ -412,9 +428,9 @@
}
static int
-t4mul(Op *op, Object *s)
+t4mul(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
/*
double x;
@@ -427,9 +443,9 @@
}
static int
-t4div(Op *op, Object *s)
+t4div(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
/*
double x;
@@ -442,268 +458,268 @@
}
static int
-t4idiv(Op *op, Object *s)
+t4idiv(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-t4mod(Op *op, Object *s)
+t4mod(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-t4neg(Op *op, Object *s)
+t4neg(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-t4abs(Op *op, Object *s)
+t4abs(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-t4ceiling(Op *op, Object *s)
+t4ceiling(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-t4floor(Op *op, Object *s)
+t4floor(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-t4round(Op *op, Object *s)
+t4round(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-t4truncate(Op *op, Object *s)
+t4truncate(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-t4sqrt(Op *op, Object *s)
+t4sqrt(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-t4sin(Op *op, Object *s)
+t4sin(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-t4cos(Op *op, Object *s)
+t4cos(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-t4atan(Op *op, Object *s)
+t4atan(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-t4exp(Op *op, Object *s)
+t4exp(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-t4ln(Op *op, Object *s)
+t4ln(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-t4log(Op *op, Object *s)
+t4log(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-t4cvi(Op *op, Object *s)
+t4cvi(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-t4cvr(Op *op, Object *s)
+t4cvr(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-t4eq(Op *op, Object *s)
+t4eq(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-t4ne(Op *op, Object *s)
+t4ne(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-t4gt(Op *op, Object *s)
+t4gt(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-t4ge(Op *op, Object *s)
+t4ge(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-t4lt(Op *op, Object *s)
+t4lt(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-t4le(Op *op, Object *s)
+t4le(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-t4and(Op *op, Object *s)
+t4and(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-t4or(Op *op, Object *s)
+t4or(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-t4xor(Op *op, Object *s)
+t4xor(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-t4not(Op *op, Object *s)
+t4not(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-t4bitshift(Op *op, Object *s)
+t4bitshift(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-t4true(Op *op, Object *s)
+t4true(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-t4false(Op *op, Object *s)
+t4false(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-t4if(Op *op, Object *s)
+t4if(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-t4ifelse(Op *op, Object *s)
+t4ifelse(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-t4pop(Op *op, Object *s)
+t4pop(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-t4exch(Op *op, Object *s)
+t4exch(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-t4dup(Op *op, Object *s)
+t4dup(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-t4copy(Op *op, Object *s)
+t4copy(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-t4index(Op *op, Object *s)
+t4index(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
static int
-t4roll(Op *op, Object *s)
+t4roll(Op *op, Page *p)
{
- USED(op, s);
+ USED(op, p);
return 0;
}
@@ -856,18 +872,104 @@
{nil, nil, 0},
};
+// If an op is found at the current position in the stream, the associated Op is
+// returned and the stream is advanced. Otherwise, nil is returned and the stream
+// is left unchanged.
+// Only the bytes between buf.off and buf.sz are examined; an operator that runs
+// up to the very end of the buffered data is accepted as if a delimiter followed.
Op *
-opfind(char *name)
+opfind(Stream *s)
{
int i;
+	uint len, avail;
Op *op;
+	char *b = (char*)s->buf.b + s->buf.off;
i = 0;
- op = &ops[0];
- while(op->s != nil){
- if(strcmp(op->s, name) == 0)
+	/* nothing left to match against */
+	if(s->buf.off >= s->buf.sz)
+		return nil;
+	avail = s->buf.sz - s->buf.off;
+	while(ops[i].s != nil){
+		op = &ops[i];
+		len = strlen(op->s);
+		/* never compare or peek past the end of the buffered data */
+		if(len <= avail && strncmp(op->s, b, len) == 0
+		&& (len == avail || isws(b[len]) || isdelim(b[len]))){
+			s->buf.off += len;
return op;
+		}
i += 1;
- op = &ops[i];
}
return nil;
+}
+
+/*
+ * Put a Page into its empty state: no operand stack and no text.
+ * The stack belongs to a single content stream, so it is created when a
+ * stream is rendered rather than here.
+ */
+void
+pageinit(Page *page)
+{
+	page->stack = nil;
+	page->text = nil;
+}
+
+/*
+ * Release everything a Page owns.  The Page itself belongs to the caller
+ * and is left in the same empty state pageinit produces, so calling
+ * pagefree again or reusing the Page is harmless.
+ */
+void
+pagefree(Page *p)
+{
+	free(p->text);
+	p->text = nil;
+	/* the stack stays nil when no content stream was ever rendered */
+	if(p->stack != nil)
+		pdfobjfree(p->stack);
+	p->stack = nil;
+}
+
+/*
+ * Empty the operand stack: free every element, then the element storage,
+ * leaving an array object with no entries.
+ */
+static void
+stackreset(Object *stack)
+{
+	int k, n;
+
+	n = stack->array.ne;
+	for(k = 0; k < n; k++)
+		pdfobjfree(stack->array.e[k]);
+	free(stack->array.e);
+	stack->array.e = nil;
+	stack->array.ne = 0;
+}
+
+/*
+ * Interpret one content stream of a page.  Objects read from the stream
+ * are pushed on p->stack as operands; when an operator is recognised its
+ * handler runs and the stack is emptied for the next operator.  A line
+ * break is written to fd 1 once the stream has been consumed.
+ * Failure to open the stream is fatal; a failed operand read or push is
+ * reported on fd 2 and ends the stream early.
+ */
+static void
+pagerendercontent(Page *p, Object *content)
+{
+	Stream *s;
+	Object *o;
+	Op *op;
+
+	s = Sopen(content);
+	if(s == nil){
+		fprint(2, "%O\n", content);
+		sysfatal("%r");
+	}
+	/* a page may have several content streams: drop the previous stack */
+	if(p->stack != nil)
+		pdfobjfree(p->stack);
+	p->stack = arraynew(content->pdf);
+	if(p->stack == nil){
+		Sclose(s);
+		return;
+	}
+	while(s->buf.off < s->buf.sz){
+		/* check the bound first so the byte at buf.sz is never read */
+		while(s->buf.off < s->buf.sz && isws(s->buf.b[s->buf.off]))
+			s->buf.off += 1;
+		if(s->buf.off >= s->buf.sz)
+			break;
+		op = opfind(s);
+		if(op != nil){
+			op->f(op, p);
+			stackreset(p->stack);
+		}else{
+			o = pdfobj(content->pdf, s);
+			if(o == nil){
+				fprint(2, "failed to read operand: %r\n");
+				break;
+			}
+			if(!arrayadd(p->stack, o)){
+				fprint(2, "Failed to push operand to stack: %r\n");
+				break;
+			}
+		}
+	}
+	fprint(1, "\n");
+	Sclose(s);
+}
+
+/*
+ * Render the content named by the page's Contents entry.  An array is
+ * rendered element by element, an Onull entry renders nothing, and any
+ * other object is rendered directly.  Returns 0 on every path.
+ */
+int
+pagerender(Page *p, Object *o)
+{
+	Object *content;
+	int i;
+
+	content = dictget(o, "Contents");
+	if(content->type == Onull)
+		return 0;
+	if(content->type != Oarray){
+		pagerendercontent(p, content);
+		return 0;
+	}
+	for(i = 0; i < arraylen(content); i += 1)
+		pagerendercontent(p, arrayget(content, i));
+	return 0;
}
--- a/pdf.h
+++ b/pdf.h
@@ -22,7 +22,12 @@
#pragma incomplete PredictParms;
typedef struct Stream Stream;
typedef struct Xref Xref;
+typedef struct Page Page;
+struct Page {	/* state carried while one page is rendered */
+ Object *stack;	/* operands collected ahead of the next operator; created per content stream */
+ char *text;	/* freed by pagefree; in the visible diff it is only ever set to nil (pageinit) */
+};
struct Buffer {
uchar *b;
@@ -188,6 +193,7 @@
int isutf8(char *s, int len);
int arraylen(Object *o);
+Object *arraynew(Pdf *pdf);
Object *arrayget(Object *o, int i);
int arrayint(Object *o, int i);
int arrayadd(Object *a, Object *o);
@@ -232,6 +238,10 @@
int bufput(Buffer *b, uchar *d, int sz);
int bufget(Buffer *b, uchar *d, int sz);
void bufdump(Buffer *b);
+
+void pageinit(Page *p);
+int pagerender(Page *p, Object *o);
+void pagefree(Page *p);
#pragma varargck type "O" Object*
#pragma varargck type "T" Object*