shithub: neatroff

Download patch

ref: 5d20e93d9fcd37af03128fc712b282c7e9993b98
parent: 0065e9a53fb42e3116a469d44610c3c39d48c834
author: Ali Gholami Rudi <[email protected]>
date: Wed Apr 30 11:59:09 EDT 2014

fmt: new hyphenation support with penalties

The .hyp request specifies the hyphenation penalty.  An argument of
1000 gives each hyphenation the same cost as formatting an empty line;
values between 0 and 100 seem reasonable for common line lengths.

--- a/fmt.c
+++ b/fmt.c
@@ -27,6 +27,8 @@
 	int wid;	/* word's width */
 	int elsn, elsp;	/* els_neg and els_pos */
 	int gap;	/* the space before this word */
+	int hy;		/* hyphen width if inserted after this word */
+	int str;	/* does the space before it stretch */
 };
 
 struct line {
@@ -92,6 +94,12 @@
 			*els_pos = wcur->elsp;
 		free(wcur->s);
 	}
+	if (beg < end) {
+		wcur = &f->words[end - 1];
+		if (wcur->hy)
+			sbuf_append(s, "\\(hy");
+		w += wcur->hy;
+	}
 	return w;
 }
 
@@ -101,59 +109,19 @@
 	int i, w = 0;
 	for (i = beg; i < end; i++)
 		w += f->words[i].wid + f->words[i].gap;
-	return w;
+	return beg < end ? w + f->words[end - 1].hy : 0;
 }
 
-static char *fmt_strdup(char *s)
+/* the number of stretchable spaces in f */
+static int fmt_spaces(struct fmt *f, int beg, int end)
 {
-	int l = strlen(s);
-	char *r = malloc(l + 1);
-	memcpy(r, s, l + 1);
-	return r;
+	int i, n = 0;
+	for (i = beg + 1; i < end; i++)
+		if (f->words[i].str)
+			n++;
+	return n;
 }
 
-/* copy word buffer wb in fmt->words[i] */
-static void fmt_insertword(struct fmt *f, int i, struct wb *wb, int gap)
-{
-	struct word *w = &f->words[i];
-	w->s = fmt_strdup(wb_buf(wb));
-	w->wid = wb_wid(wb);
-	w->elsn = wb->els_neg;
-	w->elsp = wb->els_pos;
-	w->gap = gap;
-}
-
-/* try to hyphenate the n-th word */
-static void fmt_hyph(struct fmt *f, int n, int w, int hyph)
-{
-	struct wb w1, w2;
-	int flg = hyph | (n ? 0 : HY_ANY);
-	wb_init(&w1);
-	wb_init(&w2);
-	if (!wb_hyph(f->words[n].s, w, &w1, &w2, flg)) {
-		fmt_movewords(f, n + 2, n + 1, f->nwords - n);
-		free(f->words[n].s);
-		fmt_insertword(f, n, &w1, f->words[n].gap);
-		fmt_insertword(f, n + 1, &w2, 0);
-		f->nwords++;
-	}
-	wb_done(&w1);
-	wb_done(&w2);
-}
-
-/* estimated number of lines until traps or the end of a page */
-static int ren_safelines(void)
-{
-	return f_nexttrap() / (MAX(1, n_L) * n_v);
-}
-
-static int fmt_nlines(struct fmt *f)
-{
-	if (f->l_tail <= f->l_head)
-		return f->l_head - f->l_tail;
-	return NLINES - f->l_tail + f->l_head;
-}
-
 /* return the next line in the buffer */
 int fmt_nextline(struct fmt *f, struct sbuf *sbuf, int *w,
 		int *li, int *ll, int *els_neg, int *els_pos)
@@ -229,6 +197,38 @@
 	}
 }
 
+/* append word buffer wb to fmt->words, split at its hyphenation points */
+static void fmt_insertword(struct fmt *f, struct wb *wb, int gap)
+{
+	int hyidx[NHYPHS];
+	int hywid[NHYPHS];
+	int hydash[NHYPHS];
+	struct word *w;
+	char *beg, *end;
+	char *src = wb_buf(wb);
+	int n, i;
+	n = wb_hyph(src, hyidx, hywid, hydash, n_hy);
+	for (i = 0; i <= n; i++) {
+		w = &f->words[f->nwords++];
+		beg = src + (i > 0 ? hyidx[i - 1] : 0);
+		end = src + (i < n ? hyidx[i] : strlen(src));
+		w->s = malloc(end - beg + 1);
+		memcpy(w->s, beg, end - beg);
+		w->s[end - beg] = '\0';
+		if (n) {
+			w->wid = (i < n ? hywid[i] : wb_wid(wb)) -
+				(i > 0 ? hywid[i - 1] : 0);
+		} else {
+			w->wid = wb_wid(wb);
+		}
+		w->elsn = wb->els_neg;
+		w->elsp = wb->els_pos;
+		w->hy = i < n ? hydash[i] : 0;
+		w->str = i == 0;
+		w->gap = i == 0 ? gap : 0;
+	}
+}
+
 /* insert wb into fmt */
 void fmt_word(struct fmt *f, struct wb *wb)
 {
@@ -242,18 +242,23 @@
 		fmt_confupdate(f);
 	if (f->nls && !f->gap && f->nwords >= 1)
 		f->gap = (f->nwords && f->eos) ? FMT_SWID(f) * 2 : FMT_SWID(f);
-	fmt_insertword(f, f->nwords++, wb, f->filled ? 0 : f->gap);
+	f->eos = wb_eos(wb);
+	fmt_insertword(f, wb, f->filled ? 0 : f->gap);
 	f->filled = 0;
 	f->nls = 0;
 	f->gap = 0;
-	f->eos = wb_eos(wb);
 }
 
+/* assuming an empty line has cost 10000; take care of integer overflow */
+#define POW2(x)				((x) * (x))
+#define FMT_COST(lwid, llen, pen)	(POW2(((llen) - (lwid)) * 1000l / (llen)) / 100l + (pen) * 10l)
+
 /* the cost of putting a line break before word pos */
 static long fmt_findcost(struct fmt *f, int pos)
 {
-	int i, w;
+	int i, pen = 0;
 	long cur;
+	int lwid = 0;
 	int llen = FMT_LLEN(f);
 	if (pos <= 0)
 		return 0;
@@ -260,14 +265,18 @@
 	if (f->best_pos[pos] >= 0)
 		return f->best[pos];
 	i = pos - 1;
-	w = 0;
+	lwid = 0;
+	if (f->words[i].hy)	/* the last word is hyphenated */
+		lwid += f->words[i].hy;
+	if (f->words[i].hy)
+		pen = n_hyp;
 	while (i >= 0) {
-		w += f->words[i].wid;
+		lwid += f->words[i].wid;
 		if (i + 1 < pos)
-			w += f->words[i + 1].gap;
-		if (w > llen && pos - i > 1)
+			lwid += f->words[i + 1].gap;
+		if (lwid > llen && pos - i > 1)
 			break;
-		cur = fmt_findcost(f, i) + (llen - w) * (llen - w);
+		cur = fmt_findcost(f, i) + FMT_COST(lwid, llen, pen);
 		if (f->best_pos[pos] < 0 || cur < f->best[pos]) {
 			f->best_pos[pos] = i;
 			f->best[pos] = cur;
@@ -277,7 +286,6 @@
 	return f->best[pos];
 }
 
-/* the best position for breaking the line ending at pos */
 static int fmt_bestpos(struct fmt *f, int pos)
 {
 	fmt_findcost(f, pos);
@@ -287,26 +295,28 @@
 /* return the last filled word */
 static int fmt_breakparagraph(struct fmt *f, int pos, int all)
 {
-	int i, w;
-	long cur, best = 0;
+	int i;
+	long best = 0;
 	int best_i = -1;
 	int llen = FMT_LLEN(f);
+	int lwid = 0;
 	if (all || (pos > 0 && f->words[pos - 1].wid >= llen)) {
 		fmt_findcost(f, pos);
 		return pos;
 	}
 	i = pos - 1;
-	w = 0;
+	lwid = 0;
+	if (f->words[i].hy)	/* the last word is hyphenated */
+		lwid += f->words[i].hy;
 	while (i >= 0) {
-		w += f->words[i].wid;
+		lwid += f->words[i].wid;
 		if (i + 1 < pos)
-			w += f->words[i + 1].gap;
-		if (w > llen && pos - i > 1)
+			lwid += f->words[i + 1].gap;
+		if (lwid > llen && pos - i > 1)
 			break;
-		cur = fmt_findcost(f, i);
-		if (best_i < 0 || cur < best) {
+		if (best_i < 0 || fmt_findcost(f, i) < best) {
 			best_i = i;
-			best = cur;
+			best = fmt_findcost(f, i);
 		}
 		i--;
 	}
@@ -317,7 +327,7 @@
 static int fmt_break(struct fmt *f, int end)
 {
 	int llen, fmt_div, fmt_rem, beg;
-	int n, w, i;
+	int w, i, nspc;
 	struct line *l;
 	int ret = 0;
 	beg = fmt_bestpos(f, end);
@@ -329,17 +339,18 @@
 	llen = FMT_LLEN(f);
 	f->words[beg].gap = 0;
 	w = fmt_wordslen(f, beg, end);
-	n = end - beg;
-	if (FMT_ADJ(f) && n > 1) {
-		fmt_div = (llen - w) / (n - 1);
-		fmt_rem = (llen - w) % (n - 1);
+	nspc = fmt_spaces(f, beg, end);
+	if (FMT_ADJ(f) && nspc) {
+		fmt_div = (llen - w) / nspc;
+		fmt_rem = (llen - w) % nspc;
 		for (i = beg + 1; i < end; i++)
-			f->words[i].gap += fmt_div + (i < fmt_rem);
+			if (f->words[i].str)
+				f->words[i].gap += fmt_div + (fmt_rem-- > 0);
 	}
 	l->wid = fmt_wordscopy(f, beg, end, &l->sbuf, &l->elsn, &l->elsp);
 	if (beg > 0)
 		fmt_confupdate(f);
-	return ret + n;
+	return ret + (end - beg);
 }
 
 int fmt_fill(struct fmt *f, int all)
--- a/reg.c
+++ b/reg.c
@@ -35,7 +35,7 @@
 	".L", ".nI", ".nm", ".nM", ".nn",
 	".nS", ".m", ".s", ".u", ".v",
 	".it", ".itn", ".mc", ".mcn",
-	".ce", ".f0", ".hy", ".i0", ".l0",
+	".ce", ".f0", ".hy", ".hyp", ".i0", ".l0",
 	".L0", ".m0", ".n0", ".s0", ".ss",
 	".ti", ".lt", ".lt0", ".v0",
 };
--- a/roff.h
+++ b/roff.h
@@ -34,7 +34,7 @@
 #define RNLEN		NMLEN	/* register/macro name */
 #define ILNLEN		1000	/* line limit of input files */
 #define LNLEN		4000	/* line buffer length (ren.c/out.c) */
-#define NWORDS		512	/* number of queued words in formatting buffer */
+#define NWORDS		1024	/* number of queued words in formatting buffer */
 #define NLINES		32	/* number of queued lines in formatting buffer */
 #define NARGS		16	/* number of macro arguments */
 #define NPREV		16	/* environment stack depth */
@@ -47,6 +47,7 @@
 #define MAXFRAC		100000	/* maximum value of the fractional part */
 #define LIGLEN		4	/* length of ligatures */
 #define NCDEFS		128	/* number of character definitions (.char) */
+#define NHYPHS		8	/* maximum hyphenations per word */
 
 /* converting scales */
 #define SC_IN		(dev_res)	/* inch in units */
@@ -265,7 +266,7 @@
 void wb_italiccorrection(struct wb *wb);
 void wb_italiccorrectionleft(struct wb *wb);
 void wb_cat(struct wb *wb, struct wb *src);
-int wb_hyph(char *word, int w, struct wb *w1, struct wb *w2, int flg);
+int wb_hyph(char *word, int *hyidx, int *hywid, int *hydash, int flg);
 int wb_wid(struct wb *wb);
 int wb_empty(struct wb *wb);
 int wb_eos(struct wb *wb);
@@ -284,11 +285,9 @@
 int cdef_expand(struct wb *wb, char *c, int fn);
 
 /* hyphenation flags */
-#define HY_MASK		0x0f	/* enable hyphenation */
 #define HY_LAST		0x02	/* do not hyphenate last lines */
 #define HY_FINAL2	0x04	/* do not hyphenate the final two characters */
 #define HY_FIRST2	0x08	/* do not hyphenate the first two characters */
-#define HY_ANY		0x10	/* break at any possible position */
 
 void hyphenate(char *hyphs, char *word, int flg);
 
@@ -449,6 +448,7 @@
 #define n_f0		(*nreg(map(".f0")))	/* last .f */
 #define n_lg		(*nreg(map(".lg")))	/* .lg mode */
 #define n_hy		(*nreg(map(".hy")))	/* .hy mode */
+#define n_hyp		(*nreg(map(".hyp")))	/* hyphenation penalty  */
 #define n_i0		(*nreg(map(".i0")))	/* last .i */
 #define n_ti		(*nreg(map(".ti")))	/* pending .ti */
 #define n_kn		(*nreg(map(".kern")))	/* .kn mode */
--- a/tr.c
+++ b/tr.c
@@ -395,6 +395,11 @@
 	n_hy = args[1] ? atoi(args[1]) : 1;
 }
 
+static void tr_hyp(char **args)
+{
+	n_hyp = args[1] ? atoi(args[1]) : 1;
+}
+
 static void tr_lg(char **args)
 {
 	if (args[1])
@@ -857,6 +862,7 @@
 	{"ft", tr_ft},
 	{"hc", tr_hc},
 	{"hy", tr_hy},
+	{"hyp", tr_hyp},
 	{"hw", tr_hw},
 	{"ie", tr_if, mkargs_null},
 	{"if", tr_if, mkargs_null},
--- a/wb.c
+++ b/wb.c
@@ -427,24 +427,6 @@
 	return 0;
 }
 
-/* the position marked with hyphens or \: */
-static char *bp_pos(char *s, int w, struct wb *w1, int flg)
-{
-	char d[ILNLEN];
-	char *r = NULL;
-	int c;
-	skipreqs(&s, w1);
-	while ((c = escread(&s, d)) >= 0) {
-		wb_putc(w1, c, d);
-		if (wb_wid(w1) > w && (!(flg & HY_ANY) || r))
-			continue;
-		if (!c && (!strcmp("-", d) || (!strcmp("em", d) ||
-					!strcmp("hy", d)) || !strcmp(c_bp, d)))
-			r = s;
-	}
-	return r;
-}
-
 static int wb_dashwid(struct wb *wb)
 {
 	struct glyph *g = dev_glyph("hy", R_F(wb));
@@ -451,45 +433,63 @@
 	return charwid(R_F(wb), R_S(wb), g ? g->wid : 0);
 }
 
-/* the position marked with \% */
-static char *hc_pos(char *s, int w, struct wb *w1, int flg)
+/* find the positions marked with dashes, hyphens, \: or \% */
+static int dashpos(char *word, int *hyidx, int *hywid, int *hydash, int flg)
 {
 	char d[ILNLEN];
-	char *r = NULL;
+	struct wb wb;
+	char *s = word;
+	int n = 0;
 	int c;
-	skipreqs(&s, w1);
-	while ((c = escread(&s, d)) >= 0) {
-		wb_putc(w1, c, d);
-		if (wb_wid(w1) + wb_dashwid(w1) > w && (!(flg & HY_ANY) || r))
+	wb_init(&wb);
+	skipreqs(&s, &wb);
+	while ((c = escread(&s, d)) >= 0 && n < NHYPHS) {
+		wb_putc(&wb, c, d);
+		if (c)
 			continue;
-		if (!c && !strcmp(c_hc, d))
-			r = s;
+		hyidx[n] = s - word;
+		hywid[n] = wb_wid(&wb);
+		hydash[n] = 0;
+		if (!strcmp("-", d) || (!strcmp("em", d) ||
+				!strcmp("hy", d)) || !strcmp(c_bp, d)) {
+			n++;
+		}
+		if (!strcmp(c_hc, d)) {
+			hydash[n] = wb_dashwid(&wb);
+			n++;
+		}
 	}
-	return r;
+	wb_done(&wb);
+	return n;
 }
 
-static char *hyphpos(char *s, int w, struct wb *w1, int flg)
+static int hyphpos(char *src, int *hyidx, int *hywid, int *hydash, int flg)
 {
 	char word[ILNLEN];	/* word to pass to hyphenate() */
 	char hyph[ILNLEN];	/* hyphenation points returned from hyphenate() */
 	char *iw[ILNLEN];	/* beginning of i-th char in word */
 	char *is[ILNLEN];	/* beginning of i-th char in s */
-	int fits[ILNLEN];	/* fits[i] is 1, if the first i chars fit w */
 	int n = 0;		/* the number of characters in word */
+	int sw[ILNLEN];		/* width up to and including i-th char in word */
+	int dw[ILNLEN];		/* dash width at i-th char in word */
+	int nhy = 0;		/* number of hyphenations found */
 	char d[ILNLEN];
+	struct wb wb;
+	char *s = src;
 	char *prev_s = s;
-	char *r = NULL;
 	char *wp = word, *we = word + sizeof(word);
 	int i, c;
-	skipreqs(&s, w1);
+	wb_init(&wb);
+	skipreqs(&s, &wb);
 	while ((c = escread(&s, d)) >= 0 && (c > 0 || strlen(d) + 1 < we - wp)) {
-		fits[n] = wb_wid(w1) + wb_dashwid(w1) <= w;
-		wb_putc(w1, c, d);
+		wb_putc(&wb, c, d);
 		if (c == 0) {
 			iw[n] = wp;
 			is[n] = prev_s;
+			dw[n] = wb_dashwid(&wb);
+			sw[n] = wb_wid(&wb);
 			/* ignore multi-char aliases except for ligatures */
-			if (!utf8one(d) && !font_islig(dev_font(R_F(w1)), d))
+			if (!utf8one(d) && !font_islig(dev_font(R_F(&wb)), d))
 				strcpy(d, ".");
 			strcpy(wp, d);
 			wp = strchr(wp, '\0');
@@ -497,50 +497,35 @@
 		}
 		prev_s = s;
 	}
+	wb_done(&wb);
 	if (n < 3)
-		return NULL;
+		return 0;
 	hyphenate(hyph, word, flg);
-	for (i = 1; i < n - 1; i++)
-		if (hyph[iw[i] - word] && (fits[i] || ((flg & HY_ANY) && !r)))
-			r = is[i];
-	return r;
+	for (i = 1; i < n - 1 && nhy < NHYPHS; i++) {
+		if (hyph[iw[i] - word]) {
+			hyidx[nhy] = is[i] - src;
+			hywid[nhy] = sw[i - 1];
+			hydash[nhy] = dw[i];
+			nhy++;
+		}
+	}
+	return nhy;
 }
 
-static void dohyph(char *s, char *pos, int dash, struct wb *w1, struct wb *w2)
+int wb_hyph(char *word, int *hyidx, int *hywid, int *hydash, int flg)
 {
-	char d[ILNLEN];
-	int c = -1;
-	wb_reset(w1);
-	wb_reset(w2);
-	while (s != pos && (c = escread(&s, d)) >= 0)
-		wb_putc(w1, c, d);
-	if (dash)
-		wb_putc(w1, 0, "hy");
-	w2->r_s = w1->r_s;
-	w2->r_f = w1->r_f;
-	w2->r_m = w1->r_m;
-	while ((c = escread(&s, d)) >= 0)
-		wb_putc(w2, c, d);
-}
-
-/* hyphenate wb into w1 and w2; return zero on success */
-int wb_hyph(char *word, int w, struct wb *w1, struct wb *w2, int flg)
-{
 	char *s = word;
-	char *dp, *hp, *p;
-	if (skipreqs(&s, w1))
-		return 1;
-	dp = bp_pos(word, w, w1, flg);
-	hp = hc_pos(word, w, w1, flg);
-	if (hp && dp)
-		p = flg & HY_ANY ? MIN(dp, hp) : MAX(dp, hp);
-	else
-		p = dp ? dp : hp;
-	if (!p && flg & HY_MASK)
-		p = hyphpos(word, w, w1, flg);
-	if (p)
-		dohyph(word, p, p != dp, w1, w2);
-	return !p;
+	struct wb wb;
+	int n;
+	wb_init(&wb);
+	if (skipreqs(&s, &wb)) {
+		wb_done(&wb);
+		return 0;
+	}
+	wb_done(&wb);
+	if ((n = dashpos(word, hyidx, hywid, hydash, flg)))
+		return n;
+	return flg ? hyphpos(word, hyidx, hywid, hydash, flg) : 0;
 }
 
 void wb_italiccorrection(struct wb *wb)