ref: 5d20e93d9fcd37af03128fc712b282c7e9993b98
parent: 0065e9a53fb42e3116a469d44610c3c39d48c834
author: Ali Gholami Rudi <[email protected]>
date: Wed Apr 30 11:59:09 EDT 2014
fmt: new hyphenation support with penalties. The .hyp request can specify the hyphenation penalty. An argument of 1000 assigns the cost of formatting an empty line to each hyphenation; a value between 0 and 100 seems feasible for common line lengths.
--- a/fmt.c
+++ b/fmt.c
@@ -27,6 +27,8 @@
int wid; /* word's width */
int elsn, elsp; /* els_neg and els_pos */
int gap; /* the space before this word */
+ int hy; /* hyphen width if inserted after this word */
+ int str; /* does the space before it stretch */
};
struct line {
@@ -92,6 +94,12 @@
*els_pos = wcur->elsp;
free(wcur->s);
}
+ if (beg < end) {
+ wcur = &f->words[end - 1];
+ if (wcur->hy)
+ sbuf_append(s, "\\(hy");
+ w += wcur->hy;
+ }
return w;
}
@@ -101,59 +109,19 @@
int i, w = 0;
for (i = beg; i < end; i++)
w += f->words[i].wid + f->words[i].gap;
- return w;
+ return beg < end ? w + f->words[end - 1].hy : 0;
}
-static char *fmt_strdup(char *s)
+/* the number of stretchable spaces between words beg and end - 1 of f */
+static int fmt_spaces(struct fmt *f, int beg, int end)
{
- int l = strlen(s);
- char *r = malloc(l + 1);
- memcpy(r, s, l + 1);
- return r;
+ int i, n = 0;
+ for (i = beg + 1; i < end; i++)
+ if (f->words[i].str)
+ n++;
+ return n;
}
-/* copy word buffer wb in fmt->words[i] */
-static void fmt_insertword(struct fmt *f, int i, struct wb *wb, int gap)
-{
- struct word *w = &f->words[i];
- w->s = fmt_strdup(wb_buf(wb));
- w->wid = wb_wid(wb);
- w->elsn = wb->els_neg;
- w->elsp = wb->els_pos;
- w->gap = gap;
-}
-
-/* try to hyphenate the n-th word */
-static void fmt_hyph(struct fmt *f, int n, int w, int hyph)
-{
- struct wb w1, w2;
- int flg = hyph | (n ? 0 : HY_ANY);
- wb_init(&w1);
- wb_init(&w2);
- if (!wb_hyph(f->words[n].s, w, &w1, &w2, flg)) {
- fmt_movewords(f, n + 2, n + 1, f->nwords - n);
- free(f->words[n].s);
- fmt_insertword(f, n, &w1, f->words[n].gap);
- fmt_insertword(f, n + 1, &w2, 0);
- f->nwords++;
- }
- wb_done(&w1);
- wb_done(&w2);
-}
-
-/* estimated number of lines until traps or the end of a page */
-static int ren_safelines(void)
-{
- return f_nexttrap() / (MAX(1, n_L) * n_v);
-}
-
-static int fmt_nlines(struct fmt *f)
-{
- if (f->l_tail <= f->l_head)
- return f->l_head - f->l_tail;
- return NLINES - f->l_tail + f->l_head;
-}
-
/* return the next line in the buffer */
int fmt_nextline(struct fmt *f, struct sbuf *sbuf, int *w,
int *li, int *ll, int *els_neg, int *els_pos)
@@ -229,6 +197,38 @@
}
}
+/* append word buffer wb to f->words, split at its hyphenation points */
+static void fmt_insertword(struct fmt *f, struct wb *wb, int gap)
+{
+ int hyidx[NHYPHS];
+ int hywid[NHYPHS];
+ int hydash[NHYPHS];
+ struct word *w;
+ char *beg, *end;
+ char *src = wb_buf(wb);
+ int n, i;
+ n = wb_hyph(src, hyidx, hywid, hydash, n_hy);
+ for (i = 0; i <= n; i++) {
+ w = &f->words[f->nwords++];
+ beg = src + (i > 0 ? hyidx[i - 1] : 0);
+ end = src + (i < n ? hyidx[i] : strlen(src));
+ w->s = malloc(end - beg + 1);
+ memcpy(w->s, beg, end - beg);
+ w->s[end - beg] = '\0';
+ if (n) {
+ w->wid = (i < n ? hywid[i] : wb_wid(wb)) -
+ (i > 0 ? hywid[i - 1] : 0);
+ } else {
+ w->wid = wb_wid(wb);
+ }
+ w->elsn = wb->els_neg;
+ w->elsp = wb->els_pos;
+ w->hy = i < n ? hydash[i] : 0;
+ w->str = i == 0;
+ w->gap = i == 0 ? gap : 0;
+ }
+}
+
/* insert wb into fmt */
void fmt_word(struct fmt *f, struct wb *wb)
{
@@ -242,18 +242,23 @@
fmt_confupdate(f);
if (f->nls && !f->gap && f->nwords >= 1)
f->gap = (f->nwords && f->eos) ? FMT_SWID(f) * 2 : FMT_SWID(f);
- fmt_insertword(f, f->nwords++, wb, f->filled ? 0 : f->gap);
+ f->eos = wb_eos(wb);
+ fmt_insertword(f, wb, f->filled ? 0 : f->gap);
f->filled = 0;
f->nls = 0;
f->gap = 0;
- f->eos = wb_eos(wb);
}
+/* assuming an empty line has cost 10000; take care of integer overflow */
+#define POW2(x) ((x) * (x))
+#define FMT_COST(lwid, llen, pen) (POW2(((llen) - (lwid)) * 1000l / (llen)) / 100l + (pen) * 10l)
+
/* the cost of putting a line break before word pos */
static long fmt_findcost(struct fmt *f, int pos)
{
- int i, w;
+ int i, pen = 0;
long cur;
+ int lwid = 0;
int llen = FMT_LLEN(f);
if (pos <= 0)
return 0;
@@ -260,14 +265,18 @@
if (f->best_pos[pos] >= 0)
return f->best[pos];
i = pos - 1;
- w = 0;
+ lwid = 0;
+ if (f->words[i].hy) /* the last word is hyphenated */
+ lwid += f->words[i].hy;
+ if (f->words[i].hy)
+ pen = n_hyp;
while (i >= 0) {
- w += f->words[i].wid;
+ lwid += f->words[i].wid;
if (i + 1 < pos)
- w += f->words[i + 1].gap;
- if (w > llen && pos - i > 1)
+ lwid += f->words[i + 1].gap;
+ if (lwid > llen && pos - i > 1)
break;
- cur = fmt_findcost(f, i) + (llen - w) * (llen - w);
+ cur = fmt_findcost(f, i) + FMT_COST(lwid, llen, pen);
if (f->best_pos[pos] < 0 || cur < f->best[pos]) {
f->best_pos[pos] = i;
f->best[pos] = cur;
@@ -277,7 +286,6 @@
return f->best[pos];
}
-/* the best position for breaking the line ending at pos */
static int fmt_bestpos(struct fmt *f, int pos)
{
fmt_findcost(f, pos);
@@ -287,26 +295,28 @@
/* return the last filled word */
static int fmt_breakparagraph(struct fmt *f, int pos, int all)
{
- int i, w;
- long cur, best = 0;
+ int i;
+ long best = 0;
int best_i = -1;
int llen = FMT_LLEN(f);
+ int lwid = 0;
if (all || (pos > 0 && f->words[pos - 1].wid >= llen)) {
fmt_findcost(f, pos);
return pos;
}
i = pos - 1;
- w = 0;
+ lwid = 0;
+ if (f->words[i].hy) /* the last word is hyphenated */
+ lwid += f->words[i].hy;
while (i >= 0) {
- w += f->words[i].wid;
+ lwid += f->words[i].wid;
if (i + 1 < pos)
- w += f->words[i + 1].gap;
- if (w > llen && pos - i > 1)
+ lwid += f->words[i + 1].gap;
+ if (lwid > llen && pos - i > 1)
break;
- cur = fmt_findcost(f, i);
- if (best_i < 0 || cur < best) {
+ if (best_i < 0 || fmt_findcost(f, i) < best) {
best_i = i;
- best = cur;
+ best = fmt_findcost(f, i);
}
i--;
}
@@ -317,7 +327,7 @@
static int fmt_break(struct fmt *f, int end)
{
int llen, fmt_div, fmt_rem, beg;
- int n, w, i;
+ int w, i, nspc;
struct line *l;
int ret = 0;
beg = fmt_bestpos(f, end);
@@ -329,17 +339,18 @@
llen = FMT_LLEN(f);
f->words[beg].gap = 0;
w = fmt_wordslen(f, beg, end);
- n = end - beg;
- if (FMT_ADJ(f) && n > 1) {
- fmt_div = (llen - w) / (n - 1);
- fmt_rem = (llen - w) % (n - 1);
+ nspc = fmt_spaces(f, beg, end);
+ if (FMT_ADJ(f) && nspc) {
+ fmt_div = (llen - w) / nspc;
+ fmt_rem = (llen - w) % nspc;
for (i = beg + 1; i < end; i++)
- f->words[i].gap += fmt_div + (i < fmt_rem);
+ if (f->words[i].str)
+ f->words[i].gap += fmt_div + (fmt_rem-- > 0);
}
l->wid = fmt_wordscopy(f, beg, end, &l->sbuf, &l->elsn, &l->elsp);
if (beg > 0)
fmt_confupdate(f);
- return ret + n;
+ return ret + (end - beg);
}
int fmt_fill(struct fmt *f, int all)
--- a/reg.c
+++ b/reg.c
@@ -35,7 +35,7 @@
".L", ".nI", ".nm", ".nM", ".nn",
".nS", ".m", ".s", ".u", ".v",
".it", ".itn", ".mc", ".mcn",
- ".ce", ".f0", ".hy", ".i0", ".l0",
+ ".ce", ".f0", ".hy", ".hyp", ".i0", ".l0",
".L0", ".m0", ".n0", ".s0", ".ss",
".ti", ".lt", ".lt0", ".v0",
};
--- a/roff.h
+++ b/roff.h
@@ -34,7 +34,7 @@
#define RNLEN NMLEN /* register/macro name */
#define ILNLEN 1000 /* line limit of input files */
#define LNLEN 4000 /* line buffer length (ren.c/out.c) */
-#define NWORDS 512 /* number of queued words in formatting buffer */
+#define NWORDS 1024 /* number of queued words in formatting buffer */
#define NLINES 32 /* number of queued lines in formatting buffer */
#define NARGS 16 /* number of macro arguments */
#define NPREV 16 /* environment stack depth */
@@ -47,6 +47,7 @@
#define MAXFRAC 100000 /* maximum value of the fractional part */
#define LIGLEN 4 /* length of ligatures */
#define NCDEFS 128 /* number of character definitions (.char) */
+#define NHYPHS 8 /* maximum hyphenations per word */
/* converting scales */
#define SC_IN (dev_res) /* inch in units */
@@ -265,7 +266,7 @@
void wb_italiccorrection(struct wb *wb);
void wb_italiccorrectionleft(struct wb *wb);
void wb_cat(struct wb *wb, struct wb *src);
-int wb_hyph(char *word, int w, struct wb *w1, struct wb *w2, int flg);
+int wb_hyph(char *word, int *hyidx, int *hywid, int *hydash, int flg);
int wb_wid(struct wb *wb);
int wb_empty(struct wb *wb);
int wb_eos(struct wb *wb);
@@ -284,11 +285,9 @@
int cdef_expand(struct wb *wb, char *c, int fn);
/* hyphenation flags */
-#define HY_MASK 0x0f /* enable hyphenation */
#define HY_LAST 0x02 /* do not hyphenate last lines */
#define HY_FINAL2 0x04 /* do not hyphenate the final two characters */
#define HY_FIRST2 0x08 /* do not hyphenate the first two characters */
-#define HY_ANY 0x10 /* break at any possible position */
void hyphenate(char *hyphs, char *word, int flg);
@@ -449,6 +448,7 @@
#define n_f0 (*nreg(map(".f0"))) /* last .f */
#define n_lg (*nreg(map(".lg"))) /* .lg mode */
#define n_hy (*nreg(map(".hy"))) /* .hy mode */
+#define n_hyp (*nreg(map(".hyp"))) /* hyphenation penalty */
#define n_i0 (*nreg(map(".i0"))) /* last .i */
#define n_ti (*nreg(map(".ti"))) /* pending .ti */
#define n_kn (*nreg(map(".kern"))) /* .kn mode */
--- a/tr.c
+++ b/tr.c
@@ -395,6 +395,11 @@
n_hy = args[1] ? atoi(args[1]) : 1;
}
+static void tr_hyp(char **args)
+{
+ n_hyp = args[1] ? atoi(args[1]) : 1;
+}
+
static void tr_lg(char **args)
{
if (args[1])
@@ -857,6 +862,7 @@
{"ft", tr_ft},
{"hc", tr_hc},
{"hy", tr_hy},
+ {"hyp", tr_hyp},
{"hw", tr_hw},
{"ie", tr_if, mkargs_null},
{"if", tr_if, mkargs_null},
--- a/wb.c
+++ b/wb.c
@@ -427,24 +427,6 @@
return 0;
}
-/* the position marked with hyphens or \: */
-static char *bp_pos(char *s, int w, struct wb *w1, int flg)
-{
- char d[ILNLEN];
- char *r = NULL;
- int c;
- skipreqs(&s, w1);
- while ((c = escread(&s, d)) >= 0) {
- wb_putc(w1, c, d);
- if (wb_wid(w1) > w && (!(flg & HY_ANY) || r))
- continue;
- if (!c && (!strcmp("-", d) || (!strcmp("em", d) ||
- !strcmp("hy", d)) || !strcmp(c_bp, d)))
- r = s;
- }
- return r;
-}
-
static int wb_dashwid(struct wb *wb)
{
struct glyph *g = dev_glyph("hy", R_F(wb));
@@ -451,45 +433,63 @@
return charwid(R_F(wb), R_S(wb), g ? g->wid : 0);
}
-/* the position marked with \% */
-static char *hc_pos(char *s, int w, struct wb *w1, int flg)
+/* find the positions marked with dashes, hyphens, \: or \% */
+static int dashpos(char *word, int *hyidx, int *hywid, int *hydash, int flg)
{
char d[ILNLEN];
- char *r = NULL;
+ struct wb wb;
+ char *s = word;
+ int n = 0;
int c;
- skipreqs(&s, w1);
- while ((c = escread(&s, d)) >= 0) {
- wb_putc(w1, c, d);
- if (wb_wid(w1) + wb_dashwid(w1) > w && (!(flg & HY_ANY) || r))
+ wb_init(&wb);
+ skipreqs(&s, &wb);
+ while ((c = escread(&s, d)) >= 0 && n < NHYPHS) {
+ wb_putc(&wb, c, d);
+ if (c)
continue;
- if (!c && !strcmp(c_hc, d))
- r = s;
+ hyidx[n] = s - word;
+ hywid[n] = wb_wid(&wb);
+ hydash[n] = 0;
+ if (!strcmp("-", d) || (!strcmp("em", d) ||
+ !strcmp("hy", d)) || !strcmp(c_bp, d)) {
+ n++;
+ }
+ if (!strcmp(c_hc, d)) {
+ hydash[n] = wb_dashwid(&wb);
+ n++;
+ }
}
- return r;
+ wb_done(&wb);
+ return n;
}
-static char *hyphpos(char *s, int w, struct wb *w1, int flg)
+static int hyphpos(char *src, int *hyidx, int *hywid, int *hydash, int flg)
{
char word[ILNLEN]; /* word to pass to hyphenate() */
char hyph[ILNLEN]; /* hyphenation points returned from hyphenate() */
char *iw[ILNLEN]; /* beginning of i-th char in word */
char *is[ILNLEN]; /* beginning of i-th char in s */
- int fits[ILNLEN]; /* fits[i] is 1, if the first i chars fit w */
int n = 0; /* the number of characters in word */
+ int sw[ILNLEN]; /* width of the word up to and including its i-th char */
+ int dw[ILNLEN]; /* dash width at i-th char in word */
+ int nhy = 0; /* number of hyphenations found */
char d[ILNLEN];
+ struct wb wb;
+ char *s = src;
char *prev_s = s;
- char *r = NULL;
char *wp = word, *we = word + sizeof(word);
int i, c;
- skipreqs(&s, w1);
+ wb_init(&wb);
+ skipreqs(&s, &wb);
while ((c = escread(&s, d)) >= 0 && (c > 0 || strlen(d) + 1 < we - wp)) {
- fits[n] = wb_wid(w1) + wb_dashwid(w1) <= w;
- wb_putc(w1, c, d);
+ wb_putc(&wb, c, d);
if (c == 0) {
iw[n] = wp;
is[n] = prev_s;
+ dw[n] = wb_dashwid(&wb);
+ sw[n] = wb_wid(&wb);
/* ignore multi-char aliases except for ligatures */
- if (!utf8one(d) && !font_islig(dev_font(R_F(w1)), d))
+ if (!utf8one(d) && !font_islig(dev_font(R_F(&wb)), d))
strcpy(d, ".");
strcpy(wp, d);
wp = strchr(wp, '\0');
@@ -497,50 +497,35 @@
}
prev_s = s;
}
+ wb_done(&wb);
if (n < 3)
- return NULL;
+ return 0;
hyphenate(hyph, word, flg);
- for (i = 1; i < n - 1; i++)
- if (hyph[iw[i] - word] && (fits[i] || ((flg & HY_ANY) && !r)))
- r = is[i];
- return r;
+ for (i = 1; i < n - 1 && nhy < NHYPHS; i++) {
+ if (hyph[iw[i] - word]) {
+ hyidx[nhy] = is[i] - src;
+ hywid[nhy] = sw[i - 1];
+ hydash[nhy] = dw[i];
+ nhy++;
+ }
+ }
+ return nhy;
}
-static void dohyph(char *s, char *pos, int dash, struct wb *w1, struct wb *w2)
+int wb_hyph(char *word, int *hyidx, int *hywid, int *hydash, int flg)
{
- char d[ILNLEN];
- int c = -1;
- wb_reset(w1);
- wb_reset(w2);
- while (s != pos && (c = escread(&s, d)) >= 0)
- wb_putc(w1, c, d);
- if (dash)
- wb_putc(w1, 0, "hy");
- w2->r_s = w1->r_s;
- w2->r_f = w1->r_f;
- w2->r_m = w1->r_m;
- while ((c = escread(&s, d)) >= 0)
- wb_putc(w2, c, d);
-}
-
-/* hyphenate wb into w1 and w2; return zero on success */
-int wb_hyph(char *word, int w, struct wb *w1, struct wb *w2, int flg)
-{
char *s = word;
- char *dp, *hp, *p;
- if (skipreqs(&s, w1))
- return 1;
- dp = bp_pos(word, w, w1, flg);
- hp = hc_pos(word, w, w1, flg);
- if (hp && dp)
- p = flg & HY_ANY ? MIN(dp, hp) : MAX(dp, hp);
- else
- p = dp ? dp : hp;
- if (!p && flg & HY_MASK)
- p = hyphpos(word, w, w1, flg);
- if (p)
- dohyph(word, p, p != dp, w1, w2);
- return !p;
+ struct wb wb;
+ int n;
+ wb_init(&wb);
+ if (skipreqs(&s, &wb)) {
+ wb_done(&wb);
+ return 0;
+ }
+ wb_done(&wb);
+ if ((n = dashpos(word, hyidx, hywid, hydash, flg)))
+ return n;
+ return flg ? hyphpos(word, hyidx, hywid, hydash, flg) : 0;
}
void wb_italiccorrection(struct wb *wb)