shithub: fork

ref: 59fe51f5cbb81a135734c693ab51a5b243534c92
dir: /sys/src/cmd/upas/bayes/regen.c/

View raw version
#include <u.h>
#include <libc.h>
#include <bio.h>
#include "regexp.h"
#include "dfa.h"

/***
 * Regular expressions that buildre() compiles into the DFAs
 * this program writes to standard output.
 */

/*
 * Expressions for input that is meant to be skipped (per the array
 * name and the comments below; the consumer is not in this file).
 * buildre() passes each entry through strcpycase() and joins them
 * with '|' into a single expression.
 *
 * strcpycase() makes lower-case letters outside character classes
 * match either case, but copies the contents of [...] unchanged, so
 * any class that is meant to be case-insensitive must list both
 * cases itself.
 *
 * Tabs inside the strings are spelled \t so they stay visible.
 */
char *ignore[] =
{
	/* HTML that isn't A, IMG, or FONT */
	/* Must have a space somewhere to avoid catching <email@address> */
	"<[ \t\n\r]*("
		/* both cases by hand, like the classes below; otherwise <A ...> is skipped but <a ...> is not */
		"[^aAiIfF]|"
		"a[^> \t\r\n]|"
		"i[^mM \t\r\n]|"
		"im[^gG \t\r\n]|"
		"img[^> \t\r\n]|"
		"f[^oO \t\r\n]|"
		"fo[^Nn \t\r\n]|"
		"fon[^tT \t\r\n]|"
		"font[^> \r\t\n]"
	")[^>]*[ \t\n\r][^>]*>",
	/* tags whose name is a proper prefix of img or font, followed by white space */
	"<[ \t\n\r]*("
		"i|im|f|fo|fon"
	")[ \t\r\n][^>]*>",

	/* ignore html comments */
	"<!--([^\\-]|-[^\\-]|--[^>]|\n)*-->",

	/* random mail strings */
	/* header lines, including continuation lines that start with space or tab */
	"^message-id:.*\n([ \t].*\n)*",
	"^in-reply-to:.*\n([ \t].*\n)*",
	"^references:.*\n([ \t].*\n)*",
	"^date:.*\n([ \t].*\n)*",
	"^delivery-date:.*\n([ \t].*\n)*",
	"e?smtp id .*",
	"^\tid.*",
	"boundary=.*",
	"name=\"",
	"filename=\"",
	"news:<[^>]+>",
	"^--[^ \t]*$",

	/* base64 encoding */
	"^[0-9a-zA-Z+\\-=/]+$",

	/* uu encoding */
	"^[!-Z]+$",

	/* little things */
	".",
	"\n",
};

/*
 * Token expression; buildre() compiles the alternation of these
 * entries (currently one) into re[1].
 *
 * A match is one or more repetitions of either
 *   - a single character from the class: ASCII letters, ' ` $ !,
 *     or anything in the non-ASCII range ¡-￿, or
 *   - a digit followed by any number of [.,]digit pairs.
 *
 * The letter ranges appear in both cases inside the class because
 * strcpycase() copies the contents of [...] unchanged.
 */
char *keywords[] =
{
	"([a-zA-Z'`$!¡-￿]|[0-9]([.,][0-9])*)+",
};

/* Set to 1 by the -d flag in main(); when nonzero, dregcomp() echoes each expression before compiling it. */
int debug;

/*
 * Compile the regular expression text in buf with regcomp() and
 * convert the result to a Dreprog with dregcvt().  A nil result from
 * either step is fatal.  The intermediate Reprog is freed; the
 * Dreprog is returned to the caller.
 *
 * With the debug flag set the expression is echoed first.  The echo
 * goes to standard error (fd 2) because main() writes the generated
 * DFA text to fd 1, and a trace on the same descriptor would end up
 * inside that output.
 */
Dreprog*
dregcomp(char *buf)
{
	Reprog *r;
	Dreprog *d;

	if(debug)
		fprint(2, ">>> '%s'\n", buf);

	r = regcomp(buf);
	if(r == nil)
		sysfatal("regcomp");
	d = dregcvt(r);
	if(d == nil)
		sysfatal("dregcomp");
	free(r);
	return d;
}

/*
 * Copy the regular expression s to d, rewriting it to match without
 * regard to case: every lower-case ASCII letter that stands outside a
 * character class is replaced by the two-member class [xX].  All
 * other bytes are copied unchanged.
 *
 * A backslash escapes the byte that follows it.  The escaped byte is
 * copied verbatim and is never treated as a class delimiter or
 * expanded, so "\[" does not open a class, "\]" does not close one,
 * and an escaped letter stays as written instead of turning into
 * "\[xX]".
 *
 * Class state is a flag rather than a counter: a '[' inside a class
 * does not nest, and a stray ']' outside a class cannot push the
 * state negative and switch expansion off for the rest of the text.
 *
 * The output is not NUL-terminated and no length is checked; the
 * caller must supply room for up to four output bytes per input byte
 * and terminate the result itself.  Returns a pointer just past the
 * last byte written.
 */
char*
strcpycase(char *d, char *s)
{
	int inclass, esc;

	inclass = 0;
	esc = 0;
	for(; *s; s++){
		if(esc){
			/* byte after a backslash: literal, whatever it is */
			*d++ = *s;
			esc = 0;
			continue;
		}
		if(*s == '\\')
			esc = 1;
		else if(*s == '[')
			inclass = 1;
		else if(*s == ']')
			inclass = 0;
		if(!inclass && 'a' <= *s && *s <= 'z'){
			*d++ = '[';
			*d++ = *s;
			*d++ = *s+'A'-'a';
			*d++ = ']';
		}else
			*d++ = *s;
	}
	return d;
}

/*
 * Report a regular-expression error and terminate through sysfatal(),
 * prefixing the message with "regerror: ".
 * NOTE(review): presumably the hook the regexp library calls when
 * regcomp() rejects a pattern; nothing in the visible code calls it
 * directly -- confirm against regexp(2).
 */
void
regerror(char *msg)
{
	sysfatal("regerror: %s", msg);
}

/*
 * Fill re[] with the three compiled expressions:
 *	re[0]	the fixed pattern "^From "
 *	re[1]	every keywords[] entry, joined with '|'
 *	re[2]	every ignore[] entry, joined with '|'
 * Each table entry goes through strcpycase() on its way into one
 * static scratch buffer, which is reused for re[2] once re[1] has
 * been compiled.  No bound is checked on the scratch buffer.
 */
void
buildre(Dreprog *re[3])
{
	static char expr[16384];
	char *w, **p, **last;

	re[0] = dregcomp("^From ");

	/* keyword alternation */
	w = expr;
	last = keywords + nelem(keywords);
	for(p = keywords; p < last; p++){
		if(p != keywords)
			*w++ = '|';
		w = strcpycase(w, *p);
	}
	*w = '\0';
	re[1] = dregcomp(expr);

	/* ignore alternation, built over the same buffer */
	w = expr;
	last = ignore + nelem(ignore);
	for(p = ignore; p < last; p++){
		if(p != ignore)
			*w++ = '|';
		w = strcpycase(w, *p);
	}
	*w = '\0';
	re[2] = dregcomp(expr);
}

/*
 * Write the command synopsis to file descriptor 2 and exit with
 * status "usage".
 */
void
usage(void)
{
	static char synopsis[] = "usage: regen [-d]\n";

	fprint(2, "%s", synopsis);
	exits("usage");
}

/*
 * regen [-d]
 *
 * Build the three expressions with buildre() and write each one to
 * standard output with Bprintdfa(), in the order re[0], re[1], re[2].
 * -d sets the debug flag; any other option or any operand is a usage
 * error.
 *
 * The output buffer is closed with Bterm() before exiting so that a
 * failed write is reported as a fatal error instead of being lost
 * behind a successful exit status.
 */
void
main(int argc, char **argv)
{
	Dreprog *re[3];
	Biobuf b;

	ARGBEGIN{
	case 'd':
		debug = 1;
		break;
	default:
		usage();
	}ARGEND

	if(argc != 0)
		usage();

	buildre(re);
	Binit(&b, 1, OWRITE);
	Bprintdfa(&b, re[0]);
	Bprintdfa(&b, re[1]);
	Bprintdfa(&b, re[2]);
	/* Bterm flushes what is still buffered; negative means the write failed */
	if(Bterm(&b) < 0)
		sysfatal("write: %r");
	exits(0);
}