ref: e7dc094e568bf05688fd729e9a266ee02d2a2607
parent: 3cd1d46a1bf66df5fafe04dde885cc6f3dfcdb28
author: jidoc01 <[email protected]>
date: Mon Jun 24 20:58:47 EDT 2019
Improve charmap structure with trie Charmap's previous structure was using brute-force comparison for converting the strings in source files. It always compared given strings to all of the strings in charmap, which was very costly in huge projects. For its improvement, I changed its structure into trie, which is being used in many string-processing areas. It's now much faster than before.
--- a/include/asm/charmap.h
+++ b/include/asm/charmap.h
@@ -13,14 +13,25 @@
#define MAXCHARMAPS 512
#define CHARMAPLENGTH 16
+#define MAXCHARNODES (MAXCHARMAPS * CHARMAPLENGTH + 1)
+/*
+ * A node for trie structure.
+ */
+struct Charnode {
+ uint8_t code; /* the value in a key-value pair. */
+ uint8_t isCode; /* has one if it's a code node, not just a bridge node. */
+ struct Charnode *next[256]; /* each index representing the next possible character from its current state. */
+};
+
struct Charmap {
- int32_t count;
- char input[MAXCHARMAPS][CHARMAPLENGTH + 1];
- char output[MAXCHARMAPS];
+ int32_t charCount; /* user-side count. */
+ int32_t nodeCount; /* node-side count. */
+ struct Charnode nodes[MAXCHARNODES]; /* first node is reserved for the root node in charmap. */
};
int32_t readUTF8Char(char *destination, char *source);
+
int32_t charmap_Add(char *input, uint8_t output);
int32_t charmap_Convert(char **input);
--- a/src/asm/charmap.c
+++ b/src/asm/charmap.c
@@ -42,11 +42,10 @@
int32_t charmap_Add(char *input, uint8_t output)
{
int32_t i;
- size_t input_length;
- char temp1i[CHARMAPLENGTH + 1], temp2i[CHARMAPLENGTH + 1];
- char temp1o = 0, temp2o = 0;
+ uint8_t v;
- struct Charmap *charmap;
+ struct Charmap *charmap;
+ struct Charnode *curr_node, *temp_node;
if (pCurrentSection) {
if (pCurrentSection->charmap) {
@@ -55,7 +54,6 @@
charmap = calloc(1, sizeof(struct Charmap));
if (charmap == NULL)
fatalerror("Not enough memory for charmap");
-
pCurrentSection->charmap = charmap;
}
} else {
@@ -62,84 +60,103 @@
charmap = &globalCharmap;
}
- if (charmap->count > MAXCHARMAPS || strlen(input) > CHARMAPLENGTH)
+ if (charmap->charCount >= MAXCHARMAPS || strlen(input) > CHARMAPLENGTH)
return -1;
- input_length = strlen(input);
- if (input_length > 1) {
- i = 0;
- while (i < charmap->count + 1) {
- if (input_length > strlen(charmap->input[i])) {
- memcpy(temp1i, charmap->input[i],
- CHARMAPLENGTH + 1);
- memcpy(charmap->input[i], input, input_length);
- temp1o = charmap->output[i];
- charmap->output[i] = output;
- i++;
- break;
- }
- i++;
+ curr_node = &charmap->nodes[0];
+
+ for (i = 0; (v = (uint8_t)input[i]); i++) {
+ if (curr_node->next[v]) {
+ curr_node = curr_node->next[v];
+ } else {
+ temp_node = &charmap->nodes[charmap->nodeCount + 1];
+
+ curr_node->next[v] = temp_node;
+ curr_node = temp_node;
+
+ ++charmap->nodeCount;
}
- while (i < charmap->count + 1) {
- memcpy(temp2i, charmap->input[i], CHARMAPLENGTH + 1);
- memcpy(charmap->input[i], temp1i, CHARMAPLENGTH + 1);
- memcpy(temp1i, temp2i, CHARMAPLENGTH + 1);
- temp2o = charmap->output[i];
- charmap->output[i] = temp1o;
- temp1o = temp2o;
- i++;
- }
- memcpy(charmap->input[charmap->count + 1], temp1i,
- CHARMAPLENGTH + 1);
- charmap->output[charmap->count + 1] = temp1o;
- } else {
- memcpy(charmap->input[charmap->count], input, input_length);
- charmap->output[charmap->count] = output;
}
- return ++charmap->count;
+
+ /* prevent duplicated keys by accepting only first key-value pair. */
+ if (curr_node->isCode)
+ return charmap->charCount;
+
+ curr_node->code = output;
+ curr_node->isCode = 1;
+
+ return ++charmap->charCount;
}
int32_t charmap_Convert(char **input)
{
- struct Charmap *charmap;
+ struct Charmap *charmap;
+ struct Charnode *charnode;
- char outchar[CHARMAPLENGTH + 1];
- char *buffer;
- int32_t i, j, length;
+ char *output;
+ char outchar[8];
+ int32_t i, match, length;
+ uint8_t v, foundCode;
+
if (pCurrentSection && pCurrentSection->charmap)
charmap = pCurrentSection->charmap;
else
charmap = &globalCharmap;
- buffer = malloc(strlen(*input));
- if (buffer == NULL)
+ output = malloc(strlen(*input));
+ if (output == NULL)
fatalerror("Not enough memory for buffer");
length = 0;
+
while (**input) {
- j = 0;
- for (i = 0; i < charmap->count; i++) {
- j = strlen(charmap->input[i]);
- if (memcmp(*input, charmap->input[i], j) == 0) {
- outchar[0] = charmap->output[i];
- outchar[1] = 0;
+ charnode = &charmap->nodes[0];
+
+ /*
+ * find the longest valid match which has been registered in charmap.
+ * note that there could be either multiple matches or no match.
+ * and it possibly takes the longest match between them,
+ * which means that it ignores partial matches shorter than the longest one.
+ */
+ for (i = match = 0; (v = (*input)[i]);) {
+ if (!charnode->next[v])
break;
+
+ charnode = charnode->next[v];
+ i++;
+
+ if (charnode->isCode) {
+ match = i;
+ foundCode = charnode->code;
}
- j = 0;
}
- if (!j)
- j = readUTF8Char(outchar, *input);
+ if (match) {
+ output[length] = foundCode;
- if (!outchar[0]) {
- buffer[length++] = 0;
+ length += 1;
} else {
- for (i = 0; outchar[i]; i++)
- buffer[length++] = outchar[i];
+ /*
+ * put a utf-8 character
+ * if failed to find a match.
+ */
+ match = readUTF8Char(outchar, *input);
+
+ if (match) {
+ memcpy(output + length, *input, match);
+ } else {
+ output[length] = 0;
+ match = 1;
+ }
+
+ length += match;
}
- *input += j;
+
+ *input += match;
}
- *input = buffer;
+
+ *input = output;
+
return length;
}