ref: 850c78aaf47a167e64a01aa1dc07bbe558a7c282
parent: c08cf783c87e1e6eb48d73fe2701c121b76b6b7f
author: Rangi <[email protected]>
date: Fri Apr 9 10:05:43 EDT 2021
Report garbage chars as their bytes; don't try decoding them as UTF-8. This decoding required high lookahead, and was not even consistently useful (the `garbage_char` test case was not valid UTF-8 and so did not benefit from `reportGarbageChar`). This limits UTF-8 handling to the `STRLEN` and `STRSUB` built-in functions, and to charmap conversion.
--- a/src/asm/lexer.c
+++ b/src/asm/lexer.c
@@ -1818,55 +1818,6 @@
return i;
}
-/* Function to report one character's worth of garbage bytes */
-
-static char const *reportGarbageChar(unsigned char firstByte)
-{
- static char bytes[6 + 2 + 1]; /* Max size of a UTF-8 encoded code point, plus "''\0" */
- /* First, attempt UTF-8 decoding */
- uint32_t state = 0; /* UTF8_ACCEPT */
- uint32_t codepoint;
- uint8_t size = 0; /* Number of additional bytes to shift */
-
- bytes[1] = firstByte; /* No need to init the rest of the array */
- decode(&state, &codepoint, firstByte);
- while (state != 0 && state != 1 /* UTF8_REJECT */) {
- int c = peek(size++);
-
- if (c == EOF)
- break;
- bytes[size + 1] = c;
- decode(&state, &codepoint, c);
- }
-
- if (state == 0 && (codepoint > UCHAR_MAX || isprint((unsigned char)codepoint))) {
- /* Character is valid, printable UTF-8! */
- shiftChars(size);
- bytes[0] = '\'';
- bytes[size + 2] = '\'';
- bytes[size + 3] = '\0';
- return bytes;
- }
-
- /* The character isn't valid UTF-8, so we'll only print that first byte */
- if (isprint(firstByte)) {
- /* bytes[1] = firstByte; */
- bytes[0] = '\'';
- bytes[2] = '\'';
- bytes[3] = '\0';
- return bytes;
- }
- /* Well then, print its hex value */
- static char const hexChars[16] = "0123456789ABCDEF";
-
- bytes[0] = '0';
- bytes[1] = 'x';
- bytes[2] = hexChars[firstByte >> 4];
- bytes[3] = hexChars[firstByte & 0x0f];
- bytes[4] = '\0';
- return bytes;
-}
-
/* Lexer core */
static int yylex_SKIP_TO_ENDC(void); // forward declaration for yylex_NORMAL
@@ -2118,7 +2069,7 @@
/* Do not report weird characters when capturing, it'll be done later */
if (!lexerState->capturing) {
/* TODO: try to group reportings */
- error("Unknown character %s\n", reportGarbageChar(c));
+ error("Unknown character '%s'\n", print(c));
}
}
lexerState->atLineStart = false;
--- a/test/asm/garbage_char.err
+++ b/test/asm/garbage_char.err
@@ -1,3 +1,3 @@
ERROR: garbage_char.asm(1):
- Unknown character 0xFF
+ Unknown character '\xff'
error: Assembly aborted (1 error)!
--- a/test/asm/invalid-utf-8.err
+++ b/test/asm/invalid-utf-8.err
@@ -1,5 +1,5 @@
ERROR: invalid-utf-8.asm(6) -> invalid-utf-8.asm::m(4):
- Unknown character 0xCF
+ Unknown character '\xcf'
ERROR: invalid-utf-8.asm(6) -> invalid-utf-8.asm::m(4):
- Unknown character 0xD3
+ Unknown character '\xd3'
error: Assembly aborted (2 errors)!
--- a/test/asm/null-in-macro.err
+++ b/test/asm/null-in-macro.err
@@ -1,3 +1,3 @@
ERROR: null-in-macro.asm(4) -> null-in-macro.asm::foo(2):
- Unknown character 0x00
+ Unknown character '\x00'
error: Assembly aborted (1 error)!