ref: 4c9a929a14d76609c26c06eeac0371d531f670c8
parent: 71f88717024cd5434ff7ee541a112ece86082121
author: ISSOtm <[email protected]>
date: Tue Jul 28 18:06:03 EDT 2020
Implement almost all functionality Add keywords and identifiers Add comments Add number literals Add strings Add a lot of new tokens Add (and clean up) IF etc. Improve reporting of unexpected chars / garbage bytes Fix bug with and improved error messages when failing to open file Add verbose-level messages about how files are opened Enforce that files finish with a newline Fix chars returned not being cast to unsigned char (may conflict w/ EOF) Return null path when no file is open, rather than crash Unify and improve error printing slightly Known to be missing: macro expansion, REPT blocks, EQUS expansions
--- a/include/asm/asm.h
+++ b/include/asm/asm.h
@@ -27,9 +27,5 @@
extern uint32_t nTotalLines;
extern uint32_t nIFDepth;
extern struct Section *pCurrentSection;
-extern bool oDontExpandStrings;
-
-size_t symvaluetostring(char *dest, size_t maxLength, char *sym,
- const char *mode);
#endif /* RGBDS_ASM_ASM_H */
--- a/include/asm/fstack.h
+++ b/include/asm/fstack.h
@@ -32,7 +32,7 @@
uint32_t uniqueID;
int32_t nLine;
uint32_t nStatus;
- char *pREPTBlock;
+ char const *pREPTBlock;
uint32_t nREPTBlockCount;
uint32_t nREPTBlockSize;
int32_t nREPTBodyFirstLine;
@@ -47,7 +47,7 @@
void fstk_DumpToStr(char *buf, size_t len);
void fstk_AddIncludePath(char *s);
void fstk_RunMacro(char *s, struct MacroArgs *args);
-void fstk_RunRept(uint32_t count, int32_t nReptLineNo);
+void fstk_RunRept(uint32_t count, int32_t nReptLineNo, char const *body, size_t size);
/**
* @param path The user-provided file name
* @param fullPath The address of a pointer, which will be made to point at the full path
--- a/include/asm/lexer.h
+++ b/include/asm/lexer.h
@@ -33,10 +33,13 @@
struct LexerState *lexer_OpenFile(char const *path);
struct LexerState *lexer_OpenFileView(void);
void lexer_DeleteState(struct LexerState *state);
+void lexer_Init(void);
enum LexerMode {
LEXER_NORMAL,
- LEXER_RAW
+ LEXER_RAW,
+ LEXER_SKIP_TO_ELIF,
+ LEXER_SKIP_TO_ENDC
};
void lexer_SetMode(enum LexerMode mode);
@@ -47,7 +50,7 @@
uint32_t lexer_GetColNo(void);
void lexer_DumpStringExpansions(void);
int yylex(void);
-void lexer_SkipToBlockEnd(int blockStartToken, int blockEndToken, int endToken,
- char const **capture, size_t *size, char const *name);
+void lexer_CaptureBlock(int blockStartToken, int blockEndToken, char const **capture, size_t *size,
+ char const *name);
#endif /* RGBDS_ASM_LEXER_H */
--- a/include/asm/symbol.h
+++ b/include/asm/symbol.h
@@ -44,8 +44,8 @@
int32_t (*callback)(void);
};
struct { /* For SYM_MACRO */
- uint32_t macroSize;
- char *macro;
+ size_t macroSize;
+ char const *macro;
};
};
@@ -114,9 +114,10 @@
struct Symbol *sym_AddEqu(char const *symName, int32_t value);
struct Symbol *sym_AddSet(char const *symName, int32_t value);
uint32_t sym_GetPCValue(void);
+uint32_t sym_GetConstantSymValue(struct Symbol const *sym);
uint32_t sym_GetConstantValue(char const *s);
struct Symbol *sym_FindSymbol(char const *symName);
-struct Symbol *sym_AddMacro(char const *symName, int32_t defLineNo);
+struct Symbol *sym_AddMacro(char const *symName, int32_t defLineNo, char const *body, size_t size);
struct Symbol *sym_Ref(char const *symName);
struct Symbol *sym_AddString(char const *symName, char const *value);
uint32_t sym_GetDefinedValue(char const *s);
--- a/include/asm/util.h
+++ b/include/asm/util.h
@@ -12,6 +12,7 @@
#include <stdint.h>
uint32_t calchash(const char *s);
+char const *print(char c);
size_t readUTF8Char(uint8_t *dest, char const *src);
#endif /* RGBDS_UTIL_H */
--- a/src/asm/asmy.y
+++ b/src/asm/asmy.y
@@ -39,64 +39,8 @@
char *tzNewMacro;
uint32_t ulNewMacroSize;
int32_t nPCOffset;
-bool skipElifs; /* If this is set, ELIFs cannot be executed anymore */
+bool executedIfBlock; /* If this is set, ELIFs cannot be executed anymore */
-size_t symvaluetostring(char *dest, size_t maxLength, char *symName,
- const char *mode)
-{
- size_t length;
- struct Symbol *sym = sym_FindSymbol(symName);
-
- if (sym && sym->type == SYM_EQUS) {
- char const *src = sym_GetStringValue(sym);
- size_t i;
-
- if (mode)
- error("Print types are only allowed for numbers\n");
-
- for (i = 0; src[i] != 0; i++) {
- if (i >= maxLength)
- fatalerror("Symbol value too long to fit buffer\n");
-
- dest[i] = src[i];
- }
-
- length = i;
-
- } else {
- uint32_t value = sym_GetConstantValue(symName);
- int32_t fullLength;
-
- /* Special cheat for binary */
- if (mode && !mode[0]) {
- char binary[33]; /* 32 bits + 1 terminator */
- char *write_ptr = binary + 32;
- fullLength = 0;
- binary[32] = 0;
- do {
- *(--write_ptr) = (value & 1) + '0';
- value >>= 1;
- fullLength++;
- } while(value);
- strncpy(dest, write_ptr, maxLength + 1);
- } else {
- fullLength = snprintf(dest, maxLength + 1,
- mode ? mode : "$%" PRIX32,
- value);
- }
-
- if (fullLength < 0) {
- fatalerror("snprintf encoding error\n");
- } else {
- length = (size_t)fullLength;
- if (length > maxLength)
- fatalerror("Symbol value too long to fit buffer\n");
- }
- }
-
- return length;
-}
-
static uint32_t str2int2(uint8_t *s, int32_t length)
{
int32_t i;
@@ -388,18 +332,71 @@
| lines {
nListCountEmpty = 0;
nPCOffset = 0;
- } line '\n' {
+ } line {
nTotalLines++;
}
;
-line : label
- | label cpu_command
- | label macro
- | label simple_pseudoop
- | pseudoop
+line : label '\n'
+ | label cpu_command '\n'
+ | label macro '\n'
+ | label simple_pseudoop '\n'
+ | pseudoop '\n'
+ | conditional /* May not necessarily be followed by a newline, see below */
;
+/*
+ * For "logistical" reasons, conditionals must manage newlines themselves.
+ * This is because we need to switch the lexer's mode *after* the newline has been read,
+ * and to avoid causing some grammar conflicts (token reducing is finicky).
+ * This is DEFINITELY one of the more FRAGILE parts of the codebase, handle with care.
+ */
+conditional : if
+ /* It's important that all of these require being at line start for `skipIfBlock` */
+ | elif
+ | else
+ | endc
+;
+
+/*
+ * IF <const>: opens a conditional block.
+ * Bumps `nIFDepth`, records in `executedIfBlock` whether the condition was non-zero, and
+ * when it was zero asks the lexer to skip ahead (LEXER_SKIP_TO_ELIF).
+ * NOTE(review): `executedIfBlock` is a single flag rather than one per nesting level, so an
+ * IF nested inside an executed block overwrites the enclosing IF's state; confirm that the
+ * enclosing ELIF / ELSE still behave as intended once the inner ENDC has been parsed.
+ */
+if : T_POP_IF const '\n' {
+ nIFDepth++;
+ executedIfBlock = !!$2;
+ if (!executedIfBlock)
+ lexer_SetMode(LEXER_SKIP_TO_ELIF);
+ }
+;
+
+/*
+ * ELIF <const>: alternative branch of the current IF.
+ * Fatal error when no IF is open. If an earlier branch already ran, the lexer is told to
+ * skip to ENDC; otherwise this branch's own condition decides whether it runs or is skipped.
+ */
+elif : T_POP_ELIF const '\n' {
+ if (nIFDepth <= 0)
+ fatalerror("Found ELIF outside an IF construct\n");
+
+ /* An earlier branch of this IF already ran: nothing else may run until ENDC */
+ if (executedIfBlock) {
+ lexer_SetMode(LEXER_SKIP_TO_ENDC);
+ } else {
+ /* No branch ran so far, so this ELIF's condition decides */
+ executedIfBlock = !!$2;
+ if (!executedIfBlock)
+ lexer_SetMode(LEXER_SKIP_TO_ELIF);
+ }
+ }
+;
+
+/*
+ * ELSE: fallback branch of the current IF.
+ * Fatal error when no IF is open. The ELSE body is skipped (LEXER_SKIP_TO_ENDC) only when an
+ * earlier branch already ran; otherwise the lexer mode is left alone and the body is parsed.
+ * Note that `executedIfBlock` is not updated here.
+ */
+else : T_POP_ELSE '\n' {
+ if (nIFDepth <= 0)
+ fatalerror("Found ELSE outside an IF construct\n");
+
+ if (executedIfBlock)
+ lexer_SetMode(LEXER_SKIP_TO_ENDC);
+ }
+;
+
+/*
+ * ENDC: closes the current IF.
+ * Fatal error when no IF is open; otherwise just decrements `nIFDepth`.
+ * `executedIfBlock` is left untouched.
+ */
+endc : T_POP_ENDC '\n' {
+ /* `nIFDepth` is declared as uint32_t, so this test is effectively `== 0` */
+ if (nIFDepth <= 0)
+ fatalerror("Found ENDC outside an IF construct\n");
+
+ nIFDepth--;
+ }
+;
+
scoped_id : T_ID | T_LOCAL_ID ;
label : /* empty */
@@ -460,10 +457,6 @@
| printt
| printv
| printi
- | if
- | elif
- | else
- | endc
| export
| db
| dw
@@ -606,9 +599,9 @@
uint32_t nDefinitionLineNo = lexer_GetLineNo();
char const *body;
size_t size;
- lexer_SkipToBlockEnd(T_POP_REPT, T_POP_ENDR, T_POP_ENDR,
- &body, &size, "REPT block");
- fstk_RunRept($2, nDefinitionLineNo);
+ lexer_CaptureBlock(T_POP_REPT, T_POP_ENDR, &body, &size,
+ "REPT block");
+ fstk_RunRept($2, nDefinitionLineNo, body, size);
}
;
@@ -616,9 +609,9 @@
int32_t nDefinitionLineNo = lexer_GetLineNo();
char const *body;
size_t size;
- lexer_SkipToBlockEnd(T_POP_MACRO, T_POP_ENDM, T_POP_ENDM,
- &body, &size, "macro definition");
- sym_AddMacro($1, nDefinitionLineNo);
+ lexer_CaptureBlock(T_POP_MACRO, T_POP_ENDM, &body, &size,
+ "macro definition");
+ sym_AddMacro($1, nDefinitionLineNo, body, size);
}
;
@@ -784,72 +777,6 @@
;
printf : T_POP_PRINTF const { math_Print($2); }
-;
-
-if : T_POP_IF const {
- nIFDepth++;
- if (!$2) {
- /* The function is hardcoded to also stop on T_POP_ELSE and ENDC */
- lexer_SkipToBlockEnd(T_POP_IF, T_POP_ENDC, T_POP_ELIF,
- NULL, NULL, "if block");
- skipElifs = false;
- } else {
- skipElifs = true;
- }
- }
-;
-
-elif : T_POP_ELIF const {
- if (nIFDepth <= 0)
- fatalerror("Found ELIF outside an IF construct\n");
-
- if (skipElifs) {
- /*
- * Executed when ELIF is reached at the end of
- * an IF or ELIF block for which the condition
- * was true.
- *
- * Continue parsing at ENDC keyword
- */
- lexer_SkipToBlockEnd(T_POP_IF, T_POP_ENDC, T_POP_ENDC,
- NULL, NULL, "elif block");
- } else {
- /*
- * Executed when ELIF is skipped to because the
- * condition of the previous IF or ELIF block
- * was false.
- */
-
- if (!$2) {
- /*
- * Continue parsing after ELSE, or at
- * ELIF or ENDC keyword.
- */
- lexer_SkipToBlockEnd(T_POP_IF, T_POP_ENDC, T_POP_ELIF,
- NULL, NULL, "elif block");
- } else {
- skipElifs = true;
- }
- }
- }
-;
-
-else : T_POP_ELSE {
- if (nIFDepth <= 0)
- fatalerror("Found ELSE outside an IF construct\n");
-
- /* Continue parsing at ENDC keyword */
- lexer_SkipToBlockEnd(T_POP_IF, T_POP_ENDC, T_POP_ENDC,
- NULL, NULL, "else block");
- }
-;
-
-endc : T_POP_ENDC {
- if (nIFDepth <= 0)
- fatalerror("Found ENDC outside an IF construct\n");
-
- nIFDepth--;
- }
;
const_3bit : const {
--- a/src/asm/fstack.c
+++ b/src/asm/fstack.c
@@ -41,7 +41,7 @@
static int32_t NextIncPath;
static uint32_t nMacroCount;
-static char *pCurrentREPTBlock;
+static char const *pCurrentREPTBlock;
static uint32_t nCurrentREPTBlockSize;
static uint32_t nCurrentREPTBlockCount;
static int32_t nCurrentREPTBodyFirstLine;
@@ -249,9 +249,11 @@
pLastFile->nLine);
pLastFile = pLastFile->next;
}
+ char const *fileName = lexer_GetFileName();
- fprintf(stderr, "%s(%" PRId32 ",%" PRId32 ")",
- lexer_GetFileName(), lexer_GetLineNo(), lexer_GetColNo());
+ if (fileName)
+ fprintf(stderr, "%s(%" PRId32 ",%" PRId32 "): ",
+ fileName, lexer_GetLineNo(), lexer_GetColNo());
}
void fstk_DumpToStr(char *buf, size_t buflen)
@@ -425,7 +427,7 @@
/*
* Set up a repeat block for parsing
*/
-void fstk_RunRept(uint32_t count, int32_t nReptLineNo)
+void fstk_RunRept(uint32_t count, int32_t nReptLineNo, char const *body, size_t size)
{
if (count) {
pushcontext();
@@ -432,8 +434,8 @@
macro_SetUniqueID(nMacroCount++);
nCurrentREPTBlockCount = count;
nCurrentStatus = STAT_isREPTBlock;
- nCurrentREPTBlockSize = ulNewMacroSize;
- pCurrentREPTBlock = tzNewMacro;
+ nCurrentREPTBlockSize = size;
+ pCurrentREPTBlock = body;
nCurrentREPTBodyFirstLine = nReptLineNo + 1;
}
}
--- a/src/asm/lexer.c
+++ b/src/asm/lexer.c
@@ -9,8 +9,10 @@
#include <sys/mman.h>
#include <sys/stat.h>
#include <assert.h>
+#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
+#include <inttypes.h>
#include <limits.h>
#include <stdbool.h>
#include <stdint.h>
@@ -19,13 +21,208 @@
#include <string.h>
#include <unistd.h>
+#include "extern/utf8decoder.h"
+
+#include "asm/asm.h"
#include "asm/lexer.h"
+#include "asm/macro.h"
+#include "asm/main.h"
#include "asm/rpn.h"
-#include "asm/symbol.h" /* For MAXSYMLEN in asmy.h */
+#include "asm/symbol.h"
+#include "asm/util.h"
#include "asm/warning.h"
/* Include this last so it gets all type & constant definitions */
#include "asmy.h" /* For token definitions, generated from asmy.y */
+/*
+ * Identifiers that are also keywords are listed here. This ONLY applies to ones
+ * that would normally be matched as identifiers! Check out `yylex_NORMAL` to
+ * see how this is used.
+ * Tokens / keywords not handled here are handled in `yylex_NORMAL`'s switch.
+ */
+static struct KeywordMapping {
+ char const *name;
+ int token;
+} const keywords[] = {
+ /*
+ * CAUTION when editing this: adding keywords will probably require extra nodes in the
+ * `keywordDict` array. If you forget to, you will probably trip up an assertion, anyways.
+ * Also, all entries in this array must be in uppercase for the dict to build correctly.
+ */
+ {"ADC", T_Z80_ADC},
+ {"ADD", T_Z80_ADD},
+ {"AND", T_Z80_AND},
+ {"BIT", T_Z80_BIT},
+ {"CALL", T_Z80_CALL},
+ {"CCF", T_Z80_CCF},
+ {"CPL", T_Z80_CPL},
+ {"CP", T_Z80_CP},
+ {"DAA", T_Z80_DAA},
+ {"DEC", T_Z80_DEC},
+ {"DI", T_Z80_DI},
+ {"EI", T_Z80_EI},
+ {"HALT", T_Z80_HALT},
+ {"INC", T_Z80_INC},
+ {"JP", T_Z80_JP},
+ {"JR", T_Z80_JR},
+ {"LD", T_Z80_LD},
+ {"LDI", T_Z80_LDI},
+ {"LDD", T_Z80_LDD},
+ {"LDIO", T_Z80_LDIO},
+ {"LDH", T_Z80_LDIO},
+ {"NOP", T_Z80_NOP},
+ {"OR", T_Z80_OR},
+ {"POP", T_Z80_POP},
+ {"PUSH", T_Z80_PUSH},
+ {"RES", T_Z80_RES},
+ {"RETI", T_Z80_RETI},
+ {"RET", T_Z80_RET},
+ {"RLCA", T_Z80_RLCA},
+ {"RLC", T_Z80_RLC},
+ {"RLA", T_Z80_RLA},
+ {"RL", T_Z80_RL},
+ {"RRC", T_Z80_RRC},
+ {"RRCA", T_Z80_RRCA},
+ {"RRA", T_Z80_RRA},
+ {"RR", T_Z80_RR},
+ {"RST", T_Z80_RST},
+ {"SBC", T_Z80_SBC},
+ {"SCF", T_Z80_SCF},
+ {"SET", T_POP_SET},
+ {"SLA", T_Z80_SLA},
+ {"SRA", T_Z80_SRA},
+ {"SRL", T_Z80_SRL},
+ {"STOP", T_Z80_STOP},
+ {"SUB", T_Z80_SUB},
+ {"SWAP", T_Z80_SWAP},
+ {"XOR", T_Z80_XOR},
+
+ {"NZ", T_CC_NZ},
+ {"Z", T_CC_Z},
+ {"NC", T_CC_NC},
+ /* Handled in list of registers */
+ /* { "C", T_CC_C }, */
+
+ {"AF", T_MODE_AF},
+ {"BC", T_MODE_BC},
+ {"DE", T_MODE_DE},
+ {"HL", T_MODE_HL},
+ {"SP", T_MODE_SP},
+
+ {"A", T_TOKEN_A},
+ {"B", T_TOKEN_B},
+ {"C", T_TOKEN_C},
+ {"D", T_TOKEN_D},
+ {"E", T_TOKEN_E},
+ {"H", T_TOKEN_H},
+ {"L", T_TOKEN_L},
+
+ {"DEF", T_OP_DEF},
+
+ {"FRAGMENT", T_POP_FRAGMENT},
+ {"BANK", T_OP_BANK},
+ {"ALIGN", T_OP_ALIGN},
+
+ {"ROUND", T_OP_ROUND},
+ {"CEIL", T_OP_CEIL},
+ {"FLOOR", T_OP_FLOOR},
+ {"DIV", T_OP_FDIV},
+ {"MUL", T_OP_FMUL},
+ {"SIN", T_OP_SIN},
+ {"COS", T_OP_COS},
+ {"TAN", T_OP_TAN},
+ {"ASIN", T_OP_ASIN},
+ {"ACOS", T_OP_ACOS},
+ {"ATAN", T_OP_ATAN},
+ {"ATAN2", T_OP_ATAN2},
+
+ {"HIGH", T_OP_HIGH},
+ {"LOW", T_OP_LOW},
+ {"ISCONST", T_OP_ISCONST},
+
+ {"STRCMP", T_OP_STRCMP},
+ {"STRIN", T_OP_STRIN},
+ {"STRSUB", T_OP_STRSUB},
+ {"STRLEN", T_OP_STRLEN},
+ {"STRCAT", T_OP_STRCAT},
+ {"STRUPR", T_OP_STRUPR},
+ {"STRLWR", T_OP_STRLWR},
+
+ {"INCLUDE", T_POP_INCLUDE},
+ {"PRINTT", T_POP_PRINTT},
+ {"PRINTI", T_POP_PRINTI},
+ {"PRINTV", T_POP_PRINTV},
+ {"PRINTF", T_POP_PRINTF},
+ {"EXPORT", T_POP_EXPORT},
+ {"XDEF", T_POP_XDEF},
+ {"GLOBAL", T_POP_GLOBAL},
+ {"DS", T_POP_DS},
+ {"DB", T_POP_DB},
+ {"DW", T_POP_DW},
+ {"DL", T_POP_DL},
+ {"SECTION", T_POP_SECTION},
+ {"PURGE", T_POP_PURGE},
+
+ {"RSRESET", T_POP_RSRESET},
+ {"RSSET", T_POP_RSSET},
+
+ {"INCBIN", T_POP_INCBIN},
+ {"CHARMAP", T_POP_CHARMAP},
+ {"NEWCHARMAP", T_POP_NEWCHARMAP},
+ {"SETCHARMAP", T_POP_SETCHARMAP},
+ {"PUSHC", T_POP_PUSHC},
+ {"POPC", T_POP_POPC},
+
+ {"FAIL", T_POP_FAIL},
+ {"WARN", T_POP_WARN},
+ {"FATAL", T_POP_FATAL},
+ {"ASSERT", T_POP_ASSERT},
+ {"STATIC_ASSERT", T_POP_STATIC_ASSERT},
+
+ {"MACRO", T_POP_MACRO},
+ {"ENDM", T_POP_ENDM},
+ {"SHIFT", T_POP_SHIFT},
+
+ {"REPT", T_POP_REPT},
+ {"ENDR", T_POP_ENDR},
+
+ {"LOAD", T_POP_LOAD},
+ {"ENDL", T_POP_ENDL},
+
+ {"IF", T_POP_IF},
+ {"ELSE", T_POP_ELSE},
+ {"ELIF", T_POP_ELIF},
+ {"ENDC", T_POP_ENDC},
+
+ {"UNION", T_POP_UNION},
+ {"NEXTU", T_POP_NEXTU},
+ {"ENDU", T_POP_ENDU},
+
+ {"WRAM0", T_SECT_WRAM0},
+ {"VRAM", T_SECT_VRAM},
+ {"ROMX", T_SECT_ROMX},
+ {"ROM0", T_SECT_ROM0},
+ {"HRAM", T_SECT_HRAM},
+ {"WRAMX", T_SECT_WRAMX},
+ {"SRAM", T_SECT_SRAM},
+ {"OAM", T_SECT_OAM},
+
+ {"RB", T_POP_RB},
+ {"RW", T_POP_RW},
+ {"EQU", T_POP_EQU},
+ {"EQUS", T_POP_EQUS},
+
+ /* Handled before in list of CPU instructions */
+ /* {"SET", T_POP_SET}, */
+
+ {"PUSHS", T_POP_PUSHS},
+ {"POPS", T_POP_POPS},
+ {"PUSHO", T_POP_PUSHO},
+ {"POPO", T_POP_POPO},
+
+ {"OPT", T_POP_OPT}
+};
+
#define LEXER_BUF_SIZE 42 /* TODO: determine a sane value for this */
/* This caps the size of buffer reads, and according to POSIX, passing more than SSIZE_MAX is UB */
static_assert(LEXER_BUF_SIZE <= SSIZE_MAX);
@@ -60,6 +257,7 @@
bool atLineStart;
uint32_t lineNo;
uint32_t colNo;
+ int lastToken;
bool capturing; /* Whether the text being lexed should be captured */
size_t captureSize; /* Amount of text captured */
@@ -83,12 +281,17 @@
if (isStdin)
path = "<stdin>";
if (!state) {
- error("Failed to open file \"%s\": %s\n", path, strerror(errno));
+ error("Failed to allocate memory for lexer state: %s\n", strerror(errno));
return NULL;
}
state->path = path;
state->fd = isStdin ? STDIN_FILENO : open(path, O_RDONLY);
+ if (state->fd == -1) {
+ error("Failed to open file \"%s\": %s\n", path, strerror(errno));
+ free(state);
+ return NULL;
+ }
state->isMmapped = false; /* By default, assume it won't be mmap()ed */
off_t size = lseek(state->fd, 0, SEEK_END);
@@ -121,10 +324,16 @@
state->isMmapped = true;
state->ptr = pa;
state->size = size;
+
+ if (verbose)
+ printf("File %s successfully mmap()ped\n", path);
}
}
if (!state->isMmapped) {
/* Sometimes mmap() fails or isn't available, so have a fallback */
+ if (verbose)
+ printf("File %s opened as regular, errno reports \"%s\"\n",
+ path, strerror(errno));
lseek(state->fd, 0, SEEK_SET);
state->index = 0;
}
@@ -132,6 +341,7 @@
state->mode = LEXER_NORMAL;
state->atLineStart = true; /* yylex() will init colNo due to this */
state->lineNo = 0;
+ state->lastToken = 0;
state->capturing = false;
state->captureBuf = NULL;
@@ -156,6 +366,72 @@
free(state);
}
+/*
+ * One node of the keyword lookup tree built by `lexer_Init`.
+ * Nodes live in the `keywordDict` pool and refer to each other by index; node 0 is the root.
+ */
+struct KeywordDictNode {
+ /*
+ * The identifier charset is (currently) 44 characters big. By storing entries for the
+ * entire printable ASCII charset, minus lower-case due to case-insensitivity,
+ * we only waste (0x60 - 0x20) - 44 = 20 entries per node, which should be acceptable.
+ * In turn, this allows greatly simplifying checking an index into this array,
+ * which should help speed up the lexer.
+ */
+ /* Index into `keywordDict` of the child for each character; 0 means "no child" */
+ uint16_t children[0x60 - ' '];
+ /* Keyword ending at this node; stays NULL (zero-initialized) when none ends here */
+ struct KeywordMapping const *keyword;
+/* Since the keyword structure is invariant, the min number of nodes is known at compile time */
+} keywordDict[336] = {0}; /* Make sure to keep this correct when adding keywords! */
+
+/*
+ * Map a character to its slot in a dictionary node's `children` array.
+ * Anything above 0x60 (which covers 'a'-'z') is moved down by 'a' - 'A', so that a
+ * lowercase letter lands in the same slot as its uppercase counterpart.
+ */
+static inline uint8_t dictIndex(char c)
+{
+	char folded = c > 0x60 ? c - ('a' - 'A') : c;
+
+	return folded - ' ';
+}
+
+/*
+ * Build the keyword lookup tree in `keywordDict` from the `keywords` table.
+ * Must run before any identifier is lexed, since `readIdentifier` walks this tree.
+ */
+void lexer_Init(void)
+{
+	/*
+	 * Build the dictionary of keywords. This could be done at compile time instead, however:
+	 *  - Doing so manually is a task nobody wants to undertake
+	 *  - It would be massively hard to read
+	 *  - Doing it within CC or CPP would be quite non-trivial
+	 *  - Doing it externally would require some extra work to use only POSIX tools
+	 *  - The startup overhead isn't much compared to the program's
+	 */
+	uint16_t usedNodes = 1; /* Node 0 is the root, and is always in use */
+
+	for (size_t i = 0; i < sizeof(keywords) / sizeof(*keywords); i++) {
+		uint16_t nodeID = 0;
+
+		/* Walk the dictionary, creating intermediate nodes for the keyword */
+		for (char const *ptr = keywords[i].name; *ptr; ptr++) {
+			/*
+			 * Use the same mapping as lookups do, so that the tree being built and
+			 * the tree being searched can never disagree on a character's slot.
+			 */
+			uint8_t slot = dictIndex(*ptr);
+			uint16_t *child;
+
+			/* A keyword containing a char outside the dict's range is a table bug */
+			assert(slot < sizeof(keywordDict[nodeID].children)
+				    / sizeof(*keywordDict[nodeID].children));
+			child = &keywordDict[nodeID].children[slot];
+
+			if (*child == 0) {
+				/*
+				 * If this gets tripped up, set the size of keywordDict to
+				 * something high, compile with `-DPRINT_NODE_COUNT` (see below),
+				 * and set the size to that.
+				 */
+				assert(usedNodes < sizeof(keywordDict) / sizeof(*keywordDict));
+
+				/* There is no node at that location, grab one from the pool */
+				*child = usedNodes;
+				usedNodes++;
+			}
+			nodeID = *child;
+		}
+
+		/* This assumes that no two keywords have the same name */
+		keywordDict[nodeID].keyword = &keywords[i];
+	}
+
+#ifdef PRINT_NODE_COUNT /* For the maintainer to check how many nodes are needed */
+	printf("Lexer keyword dictionary: %zu keywords in %u nodes (pool size %zu)\n",
+	       sizeof(keywords) / sizeof(*keywords), usedNodes,
+	       sizeof(keywordDict) / sizeof(*keywordDict));
+#endif
+}
+
void lexer_SetMode(enum LexerMode mode)
{
lexerState->mode = mode;
@@ -187,7 +463,16 @@
if (lexerState->offset + distance >= lexerState->size)
return EOF;
+ /*
+ * Note: the following block is also duplicated for the non-mmap() path. This sucks.
+ * However, due to subtle handling differences, I haven't found a clean way to
+ * avoid that duplication. If you have any ideas, please discuss them in an issue or
+ * pull request. Thank you!
+ */
+
+ /* Do not perform expansions while capturing */
if (!lexerState->capturing) {
+ /* Scan the newly-inserted chars for any macro args */
bool escaped = false;
while (lexerState->nbChars < distance && !escaped) {
@@ -204,7 +489,7 @@
}
}
- return lexerState->ptr[lexerState->offset + distance];
+ return (unsigned char)lexerState->ptr[lexerState->offset + distance];
}
if (lexerState->nbChars <= distance) {
@@ -240,7 +525,7 @@
/* Do not perform expansions when capturing */
if (!lexerState->capturing) {
- /* Scan the newly-inserted chars for any expansions */
+ /* Scan the newly-inserted chars for any macro args */
bool escaped = false;
size_t index = (lexerState->index + lexerState->nbChars) % LEXER_BUF_SIZE;
@@ -276,7 +561,7 @@
if (lexerState->nbChars <= distance)
return EOF;
}
- return lexerState->buf[(lexerState->index + distance) % LEXER_BUF_SIZE];
+ return (unsigned char)lexerState->buf[(lexerState->index + distance) % LEXER_BUF_SIZE];
}
static void shiftChars(uint8_t distance)
@@ -320,7 +605,7 @@
char const *lexer_GetFileName(void)
{
- return lexerState->path;
+ return lexerState ? lexerState->path : NULL;
}
uint32_t lexer_GetLineNo(void)
@@ -338,6 +623,457 @@
/* TODO */
}
+/* Function to discard all of a line's comments */
+
+/*
+ * Consume characters up to, but not including, the end of the current line
+ * (a CR, an LF, or the end of input).
+ */
+static void discardComment(void)
+{
+	int c = peek(0);
+
+	while (c != EOF && c != '\r' && c != '\n') {
+		shiftChars(1);
+		c = peek(0);
+	}
+}
+
+/* Functions to lex numbers of various radixes */
+
+/*
+ * Lex the remaining digits of an integer in the given radix (at most 10, since only the
+ * characters '0' through '0' + radix - 1 are accepted), and store it in `yylval.nConstValue`.
+ * @param radix The base of the number being read
+ * @param baseValue The value of the digits the caller has already consumed
+ */
+static void readNumber(int radix, int32_t baseValue)
+{
+	uint32_t value = baseValue;
+
+	for (;;) {
+		int c = peek(0);
+
+		if (c < '0' || c > '0' + radix - 1)
+			break;
+		/*
+		 * Account for the incoming digit as well: `value * radix` alone may fit
+		 * while adding the digit on top of it still wraps around.
+		 */
+		if (value > (UINT32_MAX - (c - '0')) / radix)
+			warning(WARNING_LARGE_CONSTANT, "Integer constant is too large\n");
+		value = value * radix + (c - '0');
+
+		shiftChars(1);
+	}
+
+	yylval.nConstValue = value;
+}
+
+/*
+ * Lex the digits following the '.' of a fixed-point constant, and combine them with the
+ * integer part already present in `yylval.nConstValue` into a 16.16 value.
+ */
+static void readFractionalPart(void)
+{
+	uint32_t value = 0, divisor = 1;
+
+	for (;;) {
+		int c = peek(0);
+
+		if (c < '0' || c > '9')
+			break;
+		if (divisor > UINT32_MAX / 10) {
+			warning(WARNING_LARGE_CONSTANT,
+				"Precision of fixed-point constant is too large\n");
+			/* Discard any additional digits (including the current one) */
+			while (c = peek(0), c >= '0' && c <= '9')
+				shiftChars(1);
+			break;
+		}
+		/* `value` stays below `divisor`, so neither can overflow past the check above */
+		value = value * 10 + (c - '0');
+		divisor *= 10;
+		/* Consume the digit, otherwise the same one would be peeked forever */
+		shiftChars(1);
+	}
+
+	if (yylval.nConstValue > INT16_MAX || yylval.nConstValue < INT16_MIN)
+		warning(WARNING_LARGE_CONSTANT, "Magnitude of fixed-point constant is too large\n");
+
+	/* Cast to unsigned avoids UB if shifting discards bits */
+	yylval.nConstValue = (uint32_t)yylval.nConstValue << 16;
+	/* Widen before scaling: `value` can be close to 10^9, and 10^9 * 65536 needs 64 bits */
+	uint16_t fractional = (uint64_t)value * 65536 / divisor;
+
+	yylval.nConstValue |= fractional * (yylval.nConstValue >= 0 ? 1 : -1);
+}
+
+/*
+ * Lex a run of binary digits ('0' / '1') and store the resulting value in
+ * `yylval.nConstValue`. An empty run yields 0.
+ */
+static void readBinaryNumber(void)
+{
+	uint32_t value = 0;
+
+	for (;;) {
+		int c = peek(0);
+
+		/* TODO: handle `-b`'s dynamic chars */
+		if (c != '0' && c != '1')
+			break;
+		/* Report lost high bits, like the other integer readers do */
+		if (value > (UINT32_MAX - (c - '0')) / 2)
+			warning(WARNING_LARGE_CONSTANT, "Integer constant is too large\n");
+		value = value * 2 + (c - '0');
+
+		shiftChars(1);
+	}
+
+	yylval.nConstValue = value;
+}
+
+/*
+ * Lex a run of hexadecimal digits (either case) and store the resulting value in
+ * `yylval.nConstValue`. Reports an error when there is not a single digit.
+ */
+static void readHexNumber(void)
+{
+	uint32_t accum = 0;
+	bool sawDigit = false;
+
+	for (;;) {
+		int c = peek(0);
+		int digit;
+
+		if (c >= '0' && c <= '9')
+			digit = c - '0';
+		else if (c >= 'A' && c <= 'F')
+			digit = c - 'A' + 10; /* Letters come right after the decimal digits */
+		else if (c >= 'a' && c <= 'f')
+			digit = c - 'a' + 10;
+		else
+			break;
+
+		if (accum > UINT32_MAX / 16)
+			warning(WARNING_LARGE_CONSTANT, "Integer constant is too large\n");
+		accum = accum * 16 + digit;
+
+		shiftChars(1);
+		sawDigit = true;
+	}
+
+	if (!sawDigit)
+		error("Invalid integer constant, no digits after '$'\n");
+
+	yylval.nConstValue = accum;
+}
+
+/*
+ * Lex a graphics constant: a run of pixels, each a digit from '0' to '3'.
+ * Bit 0 of every pixel is collected into one bitplane and bit 1 into another;
+ * `yylval.nConstValue` receives the second bitplane in bits 8-15 and the first in bits 0-7.
+ * Only the first 8 pixels are kept, any further ones are consumed and ignored.
+ */
+static void readGfxConstant(void)
+{
+	uint32_t bp0 = 0, bp1 = 0;
+	uint8_t width = 0; /* Number of pixels read, saturating at 9 ("more than 8") */
+
+	for (;;) {
+		int c = peek(0);
+
+		/* TODO: handle `-g`'s dynamic chars */
+		if (c < '0' || c > '3')
+			break;
+		uint8_t pixel = c - '0';
+
+		if (width < 8) {
+			bp0 = bp0 << 1 | (pixel & 1);
+			bp1 = bp1 << 1 | (pixel >> 1);
+		}
+		if (width < 9)
+			width++;
+		shiftChars(1);
+	}
+
+	if (width == 0)
+		error("Invalid gfx constant, no digits after '`'\n");
+	else if (width == 9) /* Exactly 8 pixels is a full row and perfectly fine */
+		warning(WARNING_LARGE_CONSTANT,
+			"Gfx constant is too large, only 8 first pixels considered\n");
+
+	yylval.nConstValue = bp1 << 8 | bp0;
+}
+
+/* Function to read identifiers & keywords */
+
+/*
+ * Lex an identifier whose first character has already been consumed, copying its name to
+ * `yylval.tzSym` (truncated, with a warning, if it does not fit).
+ * @param firstChar The identifier's first character
+ * @return The matching keyword's token if the name is a keyword (case-insensitively),
+ *         otherwise T_LOCAL_ID if the name contains a dot, otherwise T_ID
+ */
+static int readIdentifier(char firstChar)
+{
+	int tokenType = T_ID;
+	/* Position in the keyword tree; 0 once the name can no longer be a keyword */
+	uint16_t nodeID = keywordDict[0].children[dictIndex(firstChar)];
+	size_t len = 1;
+
+	if (firstChar == '.')
+		tokenType = T_LOCAL_ID;
+	yylval.tzSym[0] = firstChar;
+
+	for (;; len++) {
+		int c = peek(0);
+		bool isAlnum = (c >= '0' && c <= '9')
+			    || (c >= 'A' && c <= 'Z')
+			    || (c >= 'a' && c <= 'z');
+		bool isSymbolPunct = c == '#' || c == '.' || c == '@' || c == '_';
+
+		/* Stop at the first char outside of the symbol charset */
+		if (!isAlnum && !isSymbolPunct)
+			break;
+		shiftChars(1);
+
+		/* Keep counting past the buffer's end, but only store what fits */
+		if (len < sizeof(yylval.tzSym) - 1)
+			yylval.tzSym[len] = c;
+
+		/* A dot anywhere makes the identifier a local one */
+		if (c == '.')
+			tokenType = T_LOCAL_ID;
+
+		/* Keep descending the keyword tree for as long as there is a match */
+		if (nodeID)
+			nodeID = keywordDict[nodeID].children[dictIndex(c)];
+	}
+
+	if (len > sizeof(yylval.tzSym) - 1) {
+		warning(WARNING_LONG_STR, "Symbol name too long, got truncated\n");
+		len = sizeof(yylval.tzSym) - 1;
+	}
+	yylval.tzSym[len] = '\0'; /* Terminate the string */
+
+	return keywordDict[nodeID].keyword ? keywordDict[nodeID].keyword->token : tokenType;
+}
+
+/* Functions to read strings */
+
+/*
+ * How an interpolated numeric symbol gets formatted; selected by the single letter that
+ * precedes the ':' in an interpolation (see the per-member comments).
+ */
+enum PrintType {
+ TYPE_NONE, /* No print type given */
+ TYPE_DECIMAL, /* d */
+ TYPE_UPPERHEX, /* X */
+ TYPE_LOWERHEX, /* x */
+ TYPE_BINARY, /* b */
+};
+
+/*
+ * Format a numeric symbol's value as text.
+ * @param dest The buffer to write the NUL-terminated result to
+ * @param bufSize The size of `dest`, terminator included
+ * @param sym The symbol whose constant value is to be printed
+ * @param type The format to print with; TYPE_NONE prints uppercase hex prefixed with '$'
+ * A warning is emitted if the result had to be truncated to fit; on a formatting error,
+ * an error is reported and `dest` is set to the empty string.
+ */
+static void intToString(char *dest, size_t bufSize, struct Symbol const *sym, enum PrintType type)
+{
+	uint32_t value = sym_GetConstantSymValue(sym);
+	int fullLength;
+
+	/* Special cheat for binary */
+	if (type == TYPE_BINARY) {
+		char binary[33]; /* 32 bits + 1 terminator */
+		char *write_ptr = binary + 32;
+
+		fullLength = 0;
+		binary[32] = 0;
+		do {
+			*(--write_ptr) = (value & 1) + '0';
+			value >>= 1;
+			fullLength++;
+		} while (value);
+		/* Unlike `strncpy`, this always terminates `dest`, even when truncating */
+		snprintf(dest, bufSize, "%s", write_ptr);
+	} else {
+		static char const * const formats[] = {
+			[TYPE_NONE]     = "$%" PRIX32,
+			[TYPE_DECIMAL]  = "%" PRId32,
+			[TYPE_UPPERHEX] = "%" PRIX32,
+			[TYPE_LOWERHEX] = "%" PRIx32
+		};
+
+		fullLength = snprintf(dest, bufSize, formats[type], value);
+		if (fullLength < 0) {
+			error("snprintf encoding error: %s\n", strerror(errno));
+			if (bufSize)
+				dest[0] = '\0';
+			/* A negative length would look huge once cast to `size_t` below */
+			return;
+		}
+	}
+
+	if ((size_t)fullLength >= bufSize)
+		warning(WARNING_LONG_STR, "Interpolated symbol %s too long to fit buffer\n",
+			sym->name);
+}
+
+/*
+ * Lex a symbol interpolation, whose opening '{' has already been shifted by the caller,
+ * up to and including the closing '}'.
+ * The text between the braces is a symbol name, optionally preceded by a one-letter print
+ * type and a ':'. Nested interpolations are expanded into the name being read.
+ * @return The text to substitute: the symbol's string value for an EQUS, or a pointer to a
+ *         static buffer (overwritten by the next numeric interpolation) for a numeric
+ *         symbol. NULL if the symbol does not exist or is of another kind, in which case
+ *         an error has been reported.
+ */
+static char const *readInterpolation(void)
+{
+ char symName[MAXSYMLEN + 1];
+ size_t i = 0;
+ enum PrintType type = TYPE_NONE;
+
+ for (;;) {
+ int c = peek(0);
+
+ if (c == '{') { /* Nested interpolation */
+ shiftChars(1);
+ char const *inner = readInterpolation();
+
+ /* Append the inner expansion to the name, stopping once the buffer is full */
+ if (inner) {
+ while (*inner) {
+ if (i == sizeof(symName))
+ break;
+ symName[i++] = *inner++;
+ }
+ }
+ } else if (c == EOF || c == '\r' || c == '\n' || c == '"') {
+ /* The offending char is left unshifted, for the caller to deal with */
+ error("Unterminated interpolation\n");
+ break;
+ } else if (c == '}') {
+ shiftChars(1);
+ break;
+ } else if (c == ':' && type == TYPE_NONE) { /* Print type, only once */
+ /* Whatever has been read so far is the print type, not the symbol's name */
+ if (i != 1) {
+ error("Print types are exactly 1 character long\n");
+ } else {
+ switch (symName[0]) {
+ case 'b':
+ type = TYPE_BINARY;
+ break;
+ case 'd':
+ type = TYPE_DECIMAL;
+ break;
+ case 'X':
+ type = TYPE_UPPERHEX;
+ break;
+ case 'x':
+ type = TYPE_LOWERHEX;
+ break;
+ default:
+ error("Invalid print type '%s'\n", print(symName[0]));
+ }
+ }
+ i = 0; /* Now that type has been set, restart at beginning of string */
+ shiftChars(1);
+ } else {
+ if (i < sizeof(symName)) /* Allow writing an extra char to flag overflow */
+ symName[i++] = c;
+ shiftChars(1);
+ }
+ }
+
+ /* The buffer being entirely filled means that at least one char did not fit */
+ if (i == sizeof(symName)) {
+ warning(WARNING_LONG_STR, "Symbol name too long\n");
+ i--;
+ }
+ symName[i] = '\0';
+
+ struct Symbol const *sym = sym_FindSymbol(symName);
+
+ if (!sym) {
+ error("Interpolated symbol \"%s\" does not exist\n", symName);
+ } else if (sym->type == SYM_EQUS) {
+ /* The print type is reported as an error, but the string is still returned */
+ if (type != TYPE_NONE)
+ error("Print types are only allowed for numbers\n");
+ return sym_GetStringValue(sym);
+ } else if (sym_IsNumeric(sym)) {
+ static char buf[33]; /* Worst case of 32 digits + terminator */
+
+ intToString(buf, sizeof(buf), sym, type);
+ return buf;
+ } else {
+ error("Only numerical and string symbols can be interpolated\n");
+ }
+ return NULL;
+}
+
+/*
+ * Lex the contents of a string literal into `yylval.tzString`, processing character escapes
+ * and symbol interpolations, until the closing '"' (which gets consumed).
+ * NOTE(review): the opening '"' is presumably consumed by the caller -- confirm.
+ * A string that is too long is truncated with a warning; a string hitting the end of the
+ * line or of the input is reported as an error, and the terminating char is left unshifted.
+ */
+static void readString(void)
+{
+ size_t i = 0;
+
+ for (;;) {
+ int c = peek(0);
+
+ switch (c) {
+ case '"':
+ shiftChars(1);
+ /* The buffer being entirely filled means that at least one char did not fit */
+ if (i == sizeof(yylval.tzString)) {
+ i--;
+ warning(WARNING_LONG_STR, "String constant too long\n");
+ }
+ yylval.tzString[i] = '\0';
+ return;
+ case '\r':
+ case '\n': /* Do not shift these! */
+ case EOF:
+ if (i == sizeof(yylval.tzString)) {
+ i--;
+ warning(WARNING_LONG_STR, "String constant too long\n");
+ }
+ yylval.tzString[i] = '\0';
+ error("Unterminated string\n");
+ return;
+
+ case '\\': /* Character escape */
+ c = peek(1);
+ /*
+ * Valid escapes shift the backslash here, so that the common code after the
+ * outer switch then shifts the escaped char; invalid ones shift nothing here,
+ * so only the backslash gets copied and shifted.
+ */
+ switch (c) {
+ case '\\': /* Return that character unchanged */
+ case '"':
+ case '{':
+ case '}':
+ shiftChars(1);
+ break;
+ case 'n':
+ c = '\n';
+ shiftChars(1);
+ break;
+ case 'r':
+ c = '\r';
+ shiftChars(1);
+ break;
+ case 't':
+ c = '\t';
+ shiftChars(1);
+ break;
+
+ case EOF: /* Can't really print that one */
+ error("Illegal character escape at end of input\n");
+ c = '\\';
+ break;
+ default:
+ error("Illegal character escape '%s'\n", print(c));
+ c = '\\';
+ break;
+ }
+ break;
+
+ case '{': /* Symbol interpolation */
+ shiftChars(1);
+ char const *ptr = readInterpolation();
+
+ /* Append the expansion to the string, stopping once the buffer is full */
+ if (ptr) {
+ while (*ptr) {
+ if (i == sizeof(yylval.tzString))
+ break;
+ yylval.tzString[i++] = *ptr++;
+ }
+ }
+ continue; /* Do not copy an additional character */
+
+ /* Regular characters will just get copied */
+ }
+ if (i < sizeof(yylval.tzString)) /* Copy one extra to flag overflow */
+ yylval.tzString[i++] = c;
+ shiftChars(1);
+ }
+}
+
+/* Function to report one character's worth of garbage bytes */
+
+/*
+ * Build a printable description of an unexpected character, for use in error messages.
+ * NOTE(review): `firstByte` is presumably already consumed by the caller, as only the
+ * additional bytes of a multi-byte character are shifted here -- confirm.
+ * @param firstByte The first (or only) byte of the offending character
+ * @return A pointer to a static buffer (overwritten by each call) containing either the
+ *         character between single quotes, or the first byte's hex value as "0xNN"
+ */
+static char const *reportGarbageChar(unsigned char firstByte)
+{
+ static char bytes[6 + 2 + 1]; /* Max size of a UTF-8 encoded code point, plus "''\0" */
+ /* First, attempt UTF-8 decoding */
+ uint32_t state = 0; /* UTF8_ACCEPT */
+ uint32_t codepoint;
+ uint8_t size = 0; /* Number of additional bytes to shift */
+
+ bytes[1] = firstByte; /* No need to init the rest of the array */
+ decode(&state, &codepoint, firstByte);
+ /* Feed continuation bytes (peeked, not shifted) until the decoder accepts or rejects */
+ while (state != 0 && state != 1 /* UTF8_REJECT */) {
+ int c = peek(size++);
+
+ if (c == EOF)
+ break;
+ bytes[size + 1] = c;
+ decode(&state, &codepoint, c);
+ }
+
+ /* Code points that do not fit in a single byte are not run through `isprint` */
+ if (state == 0 && (codepoint > UCHAR_MAX || isprint((unsigned char)codepoint))) {
+ /* Character is valid, printable UTF-8! */
+ shiftChars(size);
+ bytes[0] = '\'';
+ bytes[size + 2] = '\'';
+ bytes[size + 3] = '\0';
+ return bytes;
+ }
+
+ /* The character isn't valid UTF-8, so we'll only print that first byte */
+ if (isprint(firstByte)) {
+ /* bytes[1] = firstByte; */
+ bytes[0] = '\'';
+ bytes[2] = '\'';
+ bytes[3] = '\0';
+ return bytes;
+ }
+ /* Well then, print its hex value */
+ static char const hexChars[16] = "0123456789ABCDEF";
+
+ bytes[0] = '0';
+ bytes[1] = 'x';
+ bytes[2] = hexChars[firstByte >> 4];
+ bytes[3] = hexChars[firstByte & 0x0f];
+ bytes[4] = '\0';
+ return bytes;
+}
+
+/* Lexer core */
+
static int yylex_NORMAL(void)
{
for (;;) {
@@ -344,32 +1080,165 @@
int c = nextChar();
switch (c) {
- case '\n':
- if (lexerStateEOL) {
- lexer_SetState(lexerStateEOL);
- lexerStateEOL = NULL;
- }
- return '\n';
+ /* Ignore whitespace and comments */
- /* Ignore whitespace */
+ case '*':
+ if (!lexerState->atLineStart)
+ return T_OP_MUL;
+ warning(WARNING_OBSOLETE,
+ "'*' is deprecated for comments, please use ';' instead\n");
+ /* fallthrough */
+ case ';':
+ discardComment();
+ /* fallthrough */
case ' ':
case '\t':
break;
- /* Handle single-char tokens */
+ /* Handle unambiguous single-char tokens */
+
+ case '^':
+ return T_OP_XOR;
case '+':
return T_OP_ADD;
case '-':
return T_OP_SUB;
+ case '/':
+ return T_OP_DIV;
+ case '~':
+ return T_OP_NOT;
+ case '@':
+ return T_ID;
+
/* Handle accepted single chars */
+
case '[':
case ']':
case '(':
case ')':
case ',':
+ case ':':
return c;
+ /* Handle ambiguous 1- or 2-char tokens */
+ char secondChar;
+
+ case '|': /* Either binary or logical OR */
+ secondChar = peek(0);
+ if (secondChar == '|') {
+ shiftChars(1);
+ return T_OP_LOGICOR;
+ }
+ return T_OP_OR;
+
+ case '=': /* Either SET alias, or EQ */
+ secondChar = peek(0);
+ if (secondChar == '=') {
+ shiftChars(1);
+ return T_OP_LOGICEQU;
+ }
+ return T_POP_EQUAL;
+
+ case '<': /* Either a LT, LTE, or left shift */
+ secondChar = peek(0);
+ if (secondChar == '=') {
+ shiftChars(1);
+ return T_OP_LOGICLE;
+ } else if (secondChar == '<') {
+ shiftChars(1);
+ return T_OP_SHL;
+ }
+ return T_OP_LOGICLT;
+
+ case '>': /* Either a GT, GTE, or right shift */
+ secondChar = peek(0);
+ if (secondChar == '=') {
+ shiftChars(1);
+ return T_OP_LOGICGE;
+ } else if (secondChar == '>') {
+ shiftChars(1);
+ return T_OP_SHR;
+ }
+ return T_OP_LOGICGT;
+
+ case '!': /* Either a NEQ, or negation */
+ secondChar = peek(0);
+ if (secondChar == '=') {
+ shiftChars(1);
+ return T_OP_LOGICNE;
+ }
+ return T_OP_LOGICNOT;
+
+ /* Handle numbers */
+
+ case '$':
+ yylval.nConstValue = 0;
+ readHexNumber();
+ return T_NUMBER;
+
+ case '0': /* Decimal number */
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ readNumber(10, c - '0');
+ int perhapsPeriod = peek(0);
+
+ if (perhapsPeriod == '.') {
+ shiftChars(1);
+ readFractionalPart();
+ }
+ return T_NUMBER;
+
+ case '&':
+ secondChar = peek(0);
+ if (secondChar == '&') {
+ shiftChars(1);
+ return T_OP_LOGICAND;
+ } else if (secondChar >= '0' && secondChar <= '7') {
+ readNumber(8, 0);
+ return T_NUMBER;
+ }
+ return T_OP_AND;
+
+ case '%': /* Either a modulo, or a binary constant */
+ secondChar = peek(0);
+ if (secondChar != '0' && secondChar != '1')
+ return T_OP_MOD;
+
+ yylval.nConstValue = 0;
+ readBinaryNumber();
+ return T_NUMBER;
+
+ case '`': /* Gfx constant */
+ readGfxConstant();
+ return T_NUMBER;
+
+ /* Handle strings */
+
+ case '"':
+ readString();
+ return T_STRING;
+
+ /* Handle newlines and EOF */
+
+ case '\r':
+ if (peek(0) == '\n')
+ shiftChars(1); /* Shift that EOL */
+ /* fallthrough */
+ case '\n':
+ if (lexerStateEOL) {
+ lexer_SetState(lexerStateEOL);
+ lexerStateEOL = NULL;
+ }
+ return '\n';
+
case EOF:
/* Captures end at their buffer's boundary no matter what */
if (!lexerState->capturing) {
@@ -377,8 +1246,31 @@
}
return 0;
+ /* Handle identifiers... or error out */
+
default:
- error("Unknown character '%c'\n");
+ if ((c <= 'Z' && c >= 'A')
+ || (c <= 'z' && c >= 'a')
+ || c == '.' || c == '_') {
+ int tokenType = readIdentifier(c);
+
+ /* If a keyword, don't try to expand */
+ if (tokenType != T_ID && tokenType != T_LOCAL_ID)
+ return tokenType;
+
+ /* TODO: attempt string expansion */
+
+ if (tokenType == T_ID && lexerState->atLineStart)
+ return T_LABEL;
+
+ return tokenType;
+ }
+
+ /* Do not report weird characters when capturing, it'll be done later */
+ if (!lexerState->capturing) {
+ /* TODO: try to group reportings */
+ error("Unknown character %s\n", reportGarbageChar(c));
+ }
}
lexerState->atLineStart = false;
}
@@ -389,6 +1281,56 @@
fatalerror("LEXER_RAW not yet implemented\n");
}
+/*
+ * This function uses the fact that `if`, etc. constructs are only valid when
+ * there's nothing before them on their lines. This enables filtering
+ * "meaningful" (= at line start) vs. "meaningless" (everything else) tokens.
+ * It's especially important due to macro args not being handled in this
+ * state, and lexing them in "normal" mode potentially producing such tokens.
+ *
+ * Tokens are read and discarded until one of these is found at the start of a
+ * line while `nIFDepth` equals the depth the skip started at:
+ *  - T_POP_ENDC (always)
+ *  - T_POP_ELIF or T_POP_ELSE (only when `toEndc` is false)
+ * That token is returned. If yylex() returns 0 first, 0 is returned instead.
+ * A T_POP_IF at line start increments `nIFDepth`, and a T_POP_ENDC at line start
+ * while deeper than the starting depth decrements it; neither ends the skip.
+ */
+static int skipIfBlock(bool toEndc)
+{
+ lexer_SetMode(LEXER_NORMAL); /* Presumably makes the yylex() calls below dispatch to normal lexing rather than back here */
+ int startingDepth = nIFDepth;
+ int token;
+
+ /* Prevent expanding macro args in this state by enabling capture to nothing */
+ lexerState->capturing = true;
+ lexerState->captureSize = 0;
+ lexerState->captureBuf = NULL;
+
+ for (;;) {
+ bool atLineStart = lexerState->atLineStart; /* Sampled before yylex(), which rewrites the flag for the token it returns */
+
+ token = yylex();
+ if (token == 0) { /* Pass EOF through */
+ return token; /* NOTE(review): returns with lexerState->capturing still true -- confirm this is intended */
+ } else if (atLineStart && token == T_POP_IF) { /* Increase nesting */
+ nIFDepth++;
+ } else if (atLineStart && nIFDepth == startingDepth) { /* An occasion to finish? */
+ if (token == T_POP_ENDC || (!toEndc && (token == T_POP_ELIF
+ || token == T_POP_ELSE)))
+ break;
+ } else if (atLineStart && token == T_POP_ENDC) { /* Decrease nesting */
+ nIFDepth--; /* Only reached when deeper than startingDepth, since the previous branch handles equality */
+ }
+ }
+
+ lexerState->capturing = false;
+
+ return token;
+}
+
+static int yylex_SKIP_TO_ELIF(void) /* Handler for LEXER_SKIP_TO_ELIF: the skip may end on ELIF, ELSE or ENDC */
+{
+ return skipIfBlock(false); /* toEndc = false */
+}
+
+static int yylex_SKIP_TO_ENDC(void) /* Handler for LEXER_SKIP_TO_ENDC: the skip only ends on ENDC */
+{
+ return skipIfBlock(true); /* toEndc = true */
+}
+
int yylex(void)
{
if (lexerState->atLineStart) {
@@ -397,21 +1339,27 @@
}
static int (* const lexerModeFuncs[])(void) = {
- [LEXER_NORMAL] = yylex_NORMAL,
- [LEXER_RAW] = yylex_RAW,
+ [LEXER_NORMAL] = yylex_NORMAL,
+ [LEXER_RAW] = yylex_RAW,
+ [LEXER_SKIP_TO_ELIF] = yylex_SKIP_TO_ELIF,
+ [LEXER_SKIP_TO_ENDC] = yylex_SKIP_TO_ENDC
};
int token = lexerModeFuncs[lexerState->mode]();
+ /* Make sure to terminate files with a line feed */
+ if (token == 0 && lexerState->lastToken != '\n')
+ token = '\n';
+ lexerState->lastToken = token;
+
+ lexerState->atLineStart = false;
if (token == '\n')
lexerState->atLineStart = true;
- else if (lexerState->atLineStart)
- lexerState->atLineStart = false;
return token;
}
-void lexer_SkipToBlockEnd(int blockStartToken, int blockEndToken, int endToken,
- char const **capture, size_t *size, char const *name)
+void lexer_CaptureBlock(int blockStartToken, int blockEndToken, char const **capture, size_t *size,
+ char const *name)
{
lexerState->capturing = true;
lexerState->captureSize = 0;
@@ -418,30 +1366,19 @@
unsigned int level = 0;
char *captureStart;
- if (capture) {
- if (lexerState->isMmapped) {
- captureStart = lexerState->ptr;
- } else {
- lexerState->captureCapacity = 128; /* The initial size will be twice that */
- reallocCaptureBuf();
- captureStart = lexerState->captureBuf;
- }
+ if (lexerState->isMmapped) {
+ captureStart = lexerState->ptr;
+ } else {
+ lexerState->captureCapacity = 128; /* The initial size will be twice that */
+ reallocCaptureBuf();
+ captureStart = lexerState->captureBuf;
}
for (;;) {
int token = yylex();
- if (level == 0) {
- if (token == endToken)
- break;
- /*
- * Hack: skipping after a `if` requires stopping on three different tokens,
- * which there is no simple way to make this function support. Instead,
- * if ELIF is the end token, ELSE and ENDC are also checked for here.
- */
- if (endToken == T_POP_ELIF && (token == T_POP_ELSE || token == T_POP_ENDC))
- break;
- }
+ if (level == 0 && token == blockEndToken)
+ break;
if (token == EOF)
error("Unterminated %s\n", name);
@@ -451,9 +1388,7 @@
level--;
}
- if (capture) {
- *capture = captureStart;
- *size = lexerState->captureSize;
- }
+ *capture = captureStart;
+ *size = lexerState->captureSize;
lexerState->captureBuf = NULL;
}
--- a/src/asm/main.c
+++ b/src/asm/main.c
@@ -488,6 +488,7 @@
if (!state)
fatalerror("Failed to open main file!\n");
+ lexer_Init();
lexer_SetState(state);
nStartClock = clock();
--- a/src/asm/symbol.c
+++ b/src/asm/symbol.c
@@ -210,8 +210,6 @@
labelScope = NULL;
hash_RemoveElement(symbols, symbol->name);
- if (symbol->type == SYM_MACRO)
- free(symbol->macro);
free(symbol);
}
}
@@ -230,8 +228,23 @@
}
/*
- * Return a constant symbols value
+ * Return a constant symbol's value, assuming it's defined
+ *
+ * `sym` must not be NULL: it is compared, passed on and dereferenced without a check.
+ * The PC symbol is special-cased and yields sym_GetPCValue(). Any other symbol must
+ * satisfy sym_IsConstant(); if it does not, an error is reported and 0 is returned.
*/
+uint32_t sym_GetConstantSymValue(struct Symbol const *sym)
+{
+ if (sym == PCSymbol)
+ return sym_GetPCValue();
+ else if (!sym_IsConstant(sym))
+ error("\"%s\" does not have a constant value\n", sym->name);
+ else
+ return sym_GetValue(sym);
+
+ return 0; /* Only reached after the error above */
+}
+
+/*
+ * Return a constant symbol's value
+ *
+ * Looks `s` up with sym_FindSymbol(). If no symbol is found, an error is reported
+ * and 0 is returned; otherwise the result is that of sym_GetConstantSymValue().
+ */
uint32_t sym_GetConstantValue(char const *s)
{
struct Symbol const *sym = sym_FindSymbol(s);
@@ -238,12 +251,8 @@
if (sym == NULL)
error("'%s' not defined\n", s);
- else if (sym == PCSymbol)
- return sym_GetPCValue();
- else if (!sym_IsConstant(sym))
- error("\"%s\" does not have a constant value\n", s);
else
- return sym_GetValue(sym);
+ return sym_GetConstantSymValue(sym);
return 0;
}
@@ -468,13 +477,13 @@
/*
* Add a macro definition
*/
-struct Symbol *sym_AddMacro(char const *symName, int32_t defLineNo)
+struct Symbol *sym_AddMacro(char const *symName, int32_t defLineNo, char const *body, size_t size)
{
struct Symbol *sym = createNonrelocSymbol(symName);
sym->type = SYM_MACRO;
- sym->macroSize = ulNewMacroSize;
- sym->macro = tzNewMacro;
+ sym->macroSize = size;
+ sym->macro = body;
updateSymbolFilename(sym);
/*
* The symbol is created at the line after the `endm`,
--- a/src/asm/util.c
+++ b/src/asm/util.c
@@ -6,6 +6,7 @@
* SPDX-License-Identifier: MIT
*/
+#include <ctype.h>
#include <stdint.h>
#include "asm/main.h"
@@ -25,6 +26,37 @@
hash = (hash * 33) ^ (*s++);
return hash;
+}
+
+/*
+ * Return a printable representation of `c`, e.g. for use in error messages.
+ * Printable characters are returned as-is; newline, carriage return and tab are
+ * returned as their C escapes (\n, \r, \t); anything else is rendered as \xNN
+ * (two lowercase hex digits).
+ *
+ * The result lives in a static buffer, so it is overwritten by the next call.
+ */
+char const *print(char c)
+{
+ static char buf[5]; /* '\xNN' + '\0' */
+
+ /*
+ * <ctype.h> functions are only defined for EOF and values representable as
+ * unsigned char; plain char may be negative, so convert before classifying.
+ */
+ if (isprint((unsigned char)c)) {
+ buf[0] = c;
+ buf[1] = '\0';
+ return buf;
+ }
+
+ buf[0] = '\\';
+ switch (c) {
+ case '\n':
+ buf[1] = 'n';
+ break;
+ case '\r':
+ buf[1] = 'r';
+ break;
+ case '\t':
+ buf[1] = 't';
+ break;
+
+ default: /* Print as hex */
+ buf[1] = 'x';
+ sprintf(&buf[2], "%02hhx", c); /* `hh` limits the output to one byte, so exactly 2 digits + '\0' fit */
+ return buf;
+ }
+ buf[2] = '\0';
+ return buf;
}
size_t readUTF8Char(uint8_t *dest, char const *src)
--- a/src/asm/warning.c
+++ b/src/asm/warning.c
@@ -198,14 +198,14 @@
warnx("Unknown warning `%s`", flag);
}
-void verror(const char *fmt, va_list args, char const *flag)
+void printDiag(const char *fmt, va_list args, char const *type,
+ char const *flagfmt, char const *flag) /* Shared diagnostic printer writing to stderr; unlike the removed verror(), it does not touch nbErrors */
{
- fputs("ERROR: ", stderr);
+ fputs(type, stderr); /* Severity prefix supplied by the caller */
fstk_Dump();
- fprintf(stderr, flag ? ": [-Werror=%s]\n " : ":\n ", flag);
+ fprintf(stderr, flagfmt, flag); /* `flagfmt` may consume `flag` through a single %s; a format with no conversion ignores it */
vfprintf(stderr, fmt, args);
lexer_DumpStringExpansions();
- nbErrors++;
}
void error(const char *fmt, ...)
@@ -213,8 +213,9 @@
va_list args;
va_start(args, fmt);
- verror(fmt, args, NULL);
+ printDiag(fmt, args, "ERROR: ", "\n ", NULL);
va_end(args);
+ nbErrors++;
}
noreturn_ void fatalerror(const char *fmt, ...)
@@ -222,7 +223,7 @@
va_list args;
va_start(args, fmt);
- verror(fmt, args, NULL);
+ printDiag(fmt, args, "FATAL: ", "\n ", NULL);
va_end(args);
exit(1);
@@ -240,7 +241,7 @@
return;
case WARNING_ERROR:
- verror(fmt, args, flag);
+ printDiag(fmt, args, "ERROR: ", "[-Werror=%s]\n ", flag);
va_end(args);
return;
@@ -252,11 +253,7 @@
break;
}
- fputs("warning: ", stderr);
- fstk_Dump();
- fprintf(stderr, ": [-W%s]\n ", flag);
- vfprintf(stderr, fmt, args);
- lexer_DumpStringExpansions();
+ printDiag(fmt, args, "warning: ", "[-W%s]\n ", flag);
va_end(args);
}