shithub: rgbds

Download patch

ref: 81a77a9b884eb3208ff6c9a38062e184301ae1f6
parent: 6e805cd3182390d756dc973dd61f2d31a476cdb1
author: ISSOtm <[email protected]>
date: Sat Aug 1 13:50:59 EDT 2020

Re-implement block copy to avoid expanding macro args

They were expanded during the capture, and there was no easy way to
avoid expanding them (believe me, after three hours and somehow an OOM, I
gave up trying).

--- a/include/asm/lexer.h
+++ b/include/asm/lexer.h
@@ -51,7 +51,7 @@
 uint32_t lexer_GetColNo(void);
 void lexer_DumpStringExpansions(void);
 int yylex(void);
-void lexer_CaptureBlock(int blockStartToken, int blockEndToken, char **capture, size_t *size,
-			char const *name);
+void lexer_CaptureRept(char **capture, size_t *size);
+void lexer_CaptureMacroBody(char **capture, size_t *size);
 
 #endif /* RGBDS_ASM_LEXER_H */
--- a/src/asm/asmy.y
+++ b/src/asm/asmy.y
@@ -599,9 +599,8 @@
 			uint32_t nDefinitionLineNo = lexer_GetLineNo();
 			char *body;
 			size_t size;
-			lexer_CaptureBlock(T_POP_REPT, T_POP_ENDR, &body, &size,
-					   "REPT block");
-			fstk_RunRept($2, nDefinitionLineNo, body, size - strlen("ENDR"));
+			lexer_CaptureRept(&body, &size);
+			fstk_RunRept($2, nDefinitionLineNo, body, size);
 		}
 ;
 
@@ -609,9 +608,8 @@
 			int32_t nDefinitionLineNo = lexer_GetLineNo();
 			char *body;
 			size_t size;
-			lexer_CaptureBlock(T_POP_MACRO, T_POP_ENDM, &body, &size,
-					   "macro definition");
-			sym_AddMacro($1, nDefinitionLineNo, body, size - strlen("ENDM"));
+			lexer_CaptureMacroBody(&body, &size);
+			sym_AddMacro($1, nDefinitionLineNo, body, size);
 		}
 ;
 
--- a/src/asm/lexer.c
+++ b/src/asm/lexer.c
@@ -626,19 +626,19 @@
 
 		/* Do not perform expansions while capturing */
 		if (!lexerState->capturing) {
-			/* Scan the newly-inserted chars for any macro args */
-			bool escaped = false;
+			/* Scan the new chars for any macro args */
+#define BUF_OFS (lexerState->offset + lexerState->nbChars)
+			while (lexerState->nbChars <= distance) {
+				char c = lexerState->ptr[BUF_OFS];
 
-			while (lexerState->nbChars < distance && !escaped) {
-				char c = lexerState->ptr[lexerState->offset
-							 + lexerState->nbChars++];
-
-				if (escaped) {
-					escaped = false;
+				lexerState->nbChars++;
+				if (c == '\\') {
+					if (lexerState->size <= BUF_OFS)
+						break; /* This was the last char in the buffer */
+					c = lexerState->ptr[BUF_OFS];
+					lexerState->nbChars++;
 					if ((c >= '1' && c <= '9') || c == '@')
 						fatalerror("Macro arg expansion is not implemented yet\n");
-				} else if (c == '\\') {
-					escaped = true;
 				}
 			}
 		}
@@ -774,7 +774,6 @@
 	if (lexerState->isMmapped) {
 		lexerState->offset += distance;
 	} else {
-		lexerState->nbChars -= distance;
 		lexerState->index += distance;
 		/* Wrap around if necessary */
 		if (lexerState->index >= LEXER_BUF_SIZE)
@@ -781,6 +780,7 @@
 			lexerState->index %= LEXER_BUF_SIZE;
 	}
 
+	lexerState->nbChars -= distance;
 	lexerState->colNo += distance;
 }
 
@@ -975,6 +975,11 @@
 
 /* Function to read identifiers & keywords */
 
+static bool startsIdentifier(int c)
+{
+	return (c <= 'Z' && c >= 'A') || (c <= 'z' && c >= 'a') || c == '.' || c == '_';
+}
+
 static int readIdentifier(char firstChar)
 {
 	/* Lex while checking for a keyword */
@@ -1449,9 +1454,7 @@
 		/* Handle identifiers... or error out */
 
 		default:
-			if ((c <= 'Z' && c >= 'A')
-			 || (c <= 'z' && c >= 'a')
-			 || c == '.' || c == '_') {
+			if (startsIdentifier(c)) {
 				int tokenType = readIdentifier(c);
 
 				/* If a keyword, don't try to expand */
@@ -1672,39 +1675,163 @@
 	return token;
 }
 
-void lexer_CaptureBlock(int blockStartToken, int blockEndToken, char **capture, size_t *size,
-			char const *name)
+static char *startCapture(void)
 {
 	assert(!lexerState->expansions);
 
 	lexerState->capturing = true;
 	lexerState->captureSize = 0;
-	unsigned int level = 0;
-	char *captureStart;
 
 	if (lexerState->isMmapped) {
-		captureStart = &lexerState->ptr[lexerState->offset];
+		return &lexerState->ptr[lexerState->offset];
 	} else {
 		lexerState->captureCapacity = 128; /* The initial size will be twice that */
 		reallocCaptureBuf();
-		captureStart = lexerState->captureBuf;
+		return lexerState->captureBuf;
 	}
+}
 
+void lexer_CaptureRept(char **capture, size_t *size)
+{
+	char *captureStart = startCapture();
+	unsigned int level = 0;
+	int c;
+
+	/*
+	 * Due to parser internals, it reads the EOL after the expression before calling this.
+	 * Thus, we don't need to keep one in the buffer afterwards.
+	 * The following assertion checks that.
+	 */
+	assert(lexerState->atLineStart);
 	for (;;) {
-		int token = yylex();
+		/* We're at line start, so attempt to match a `REPT` or `ENDR` token */
+		do { /* Discard initial whitespace */
+			c = nextChar();
+		} while (isWhitespace(c));
+		/* Now, try to match either `REPT` or `ENDR` as a **whole** identifier */
+		if (startsIdentifier(c)) {
+			switch (readIdentifier(c)) {
+			case T_POP_REPT:
+				level++;
+				/* Ignore the rest of that line */
+				break;
 
-		if (level == 0 && token == blockEndToken)
-			break;
+			case T_POP_ENDR:
+				if (!level) {
+					/* Read (but don't capture) until EOL or EOF */
+					lexerState->capturing = false;
+					do {
+						c = nextChar();
+					} while (c != EOF && c != '\r' && c != '\n');
+					/* Handle Windows CRLF */
+					if (c == '\r' && peek(0) == '\n')
+						shiftChars(1);
+					goto finish;
+				}
+				level--;
+			}
+		}
+		lexerState->lineNo++;
 
-		if (token == EOF)
-			error("Unterminated %s\n", name);
-		else if (token == blockStartToken)
-			level++;
-		else if (token == blockEndToken)
-			level--;
+		/* Just consume characters until EOL or EOF */
+		for (;;) {
+			if (c == EOF) {
+				error("Unterminated REPT block\n");
+				goto finish;
+			} else if (c == '\n') {
+				break;
+			} else if (c == '\r') {
+				if (peek(0) == '\n')
+					shiftChars(1);
+				break;
+			}
+			c = nextChar();
+		}
 	}
 
+finish:
 	*capture = captureStart;
-	*size = lexerState->captureSize;
+	*size = lexerState->captureSize - strlen("ENDR");
+	lexerState->captureBuf = NULL;
+}
+
+void lexer_CaptureMacroBody(char **capture, size_t *size)
+{
+	char *captureStart = startCapture();
+	unsigned int level = 0;
+	int c = peek(0);
+
+	/*
+	 * Due to parser internals, it does not read the EOL after the T_POP_MACRO before calling
+	 * this. Thus, we need to keep one in the buffer afterwards.
+	 * (Note that this also means the captured buffer begins with a newline and maybe comment)
+	 * The following assertion checks that.
+	 */
+	assert(!lexerState->atLineStart);
+	for (;;) {
+		/* Just consume characters until EOL or EOF */
+		for (;;) {
+			if (c == EOF) {
+				error("Unterminated macro definition\n");
+				goto finish;
+			} else if (c == '\n') {
+				break;
+			} else if (c == '\r') {
+				if (peek(0) == '\n')
+					shiftChars(1);
+				break;
+			}
+			c = nextChar();
+		}
+
+		/* We're at line start, attempt to match a `label: MACRO` line or `ENDM` token */
+		do { /* Discard initial whitespace */
+			c = nextChar();
+		} while (isWhitespace(c));
+		/* Now, try to match either a label or `ENDM` as a **whole** identifier */
+		if (startsIdentifier(c)) {
+			switch (readIdentifier(c)) {
+			case T_ID:
+				/* We have an initial label, look for a single colon */
+				do {
+					c = nextChar();
+				} while (isWhitespace(c));
+				if (c != ':') /* If not a colon, give up */
+					break;
+				/* And finally, a `MACRO` token */
+				do {
+					c = nextChar();
+				} while (isWhitespace(c));
+				if (!startsIdentifier(c))
+					break;
+				if (readIdentifier(c) != T_POP_MACRO)
+					break;
+				level++;
+				break;
+
+			case T_POP_ENDM:
+				if (!level) {
+					/* Read (but don't capture) until EOL or EOF */
+					lexerState->capturing = false;
+					do {
+						c = peek(0);
+						if (c == EOF || c == '\r' || c == '\n')
+							break;
+						shiftChars(1);
+					} while (c != EOF && c != '\r' && c != '\n');
+					/* Handle Windows CRLF */
+					if (c == '\r' && peek(1) == '\n')
+						shiftChars(1);
+					goto finish;
+				}
+				level--;
+			}
+		}
+		lexerState->lineNo++;
+	}
+
+finish:
+	*capture = captureStart;
+	*size = lexerState->captureSize - strlen("ENDM");
 	lexerState->captureBuf = NULL;
 }