From 1269fe6c0e1d9fa22062b8aa079b5c877385e65c Mon Sep 17 00:00:00 2001
From: MascaraSnake <jonassauer27@gmail.com>
Date: Sat, 8 Jan 2022 13:17:15 +0100
Subject: [PATCH] Implement faster textmap parser

---
 src/doomdef.h |   7 ++-
 src/m_misc.c  | 166 +++++++++++++++++++++++++++++++++++++++++++++++---
 src/p_setup.c |  71 ++++++++++-----------
 3 files changed, 194 insertions(+), 50 deletions(-)

diff --git a/src/doomdef.h b/src/doomdef.h
index 41ec9ef33f..1b86cc9548 100644
--- a/src/doomdef.h
+++ b/src/doomdef.h
@@ -483,8 +483,11 @@ extern void *(*M_Memcpy)(void* dest, const void* src, size_t n) FUNCNONNULL;
 char *va(const char *format, ...) FUNCPRINTF;
 char *M_GetToken(const char *inputString);
 void M_UnGetToken(void);
-UINT32 M_GetTokenPos(void);
-void M_SetTokenPos(UINT32 newPos);
+void M_TokenizerOpen(const char *inputString); // start tokenizing inputString
+void M_TokenizerClose(void); // free the token buffers and reset the tokenizer
+const char *M_TokenizerRead(UINT32 i); // read the next token into buffer i; NULL when none is left
+UINT32 M_TokenizerGetEndPos(void); // input offset where the next read resumes
+void M_TokenizerSetEndPos(UINT32 newPos); // make the next read resume at offset newPos
 char *sizeu1(size_t num);
 char *sizeu2(size_t num);
 char *sizeu3(size_t num);
diff --git a/src/m_misc.c b/src/m_misc.c
index 59783d5d30..d49307c7fe 100644
--- a/src/m_misc.c
+++ b/src/m_misc.c
@@ -1970,18 +1970,168 @@ void M_UnGetToken(void)
 	endPos = oldendPos;
 }
 
-/** Returns the current token's position.
- */
-UINT32 M_GetTokenPos(void)
+#define NUMTOKENS 2 // Number of token buffers; M_TokenizerRead(i) fills buffer i
+static const char *tokenizerInput = NULL; // Text being tokenized; NULL while no tokenizer is open
+static UINT32 tokenCapacity[NUMTOKENS] = {0}; // Allocated size, in chars, of each token buffer
+static char *tokenizerToken[NUMTOKENS] = {NULL}; // Token buffers handed back by M_TokenizerRead
+static UINT32 tokenizerStartPos = 0; // Offset in tokenizerInput where the current token starts
+static UINT32 tokenizerEndPos = 0; // Offset in tokenizerInput where the next read resumes
+static UINT32 tokenizerInputLength = 0; // strlen(tokenizerInput), cached by M_TokenizerOpen
+static UINT8 tokenizerInComment = 0; // 0 = not in comment, 1 = // Single-line, 2 = /* Multi-line */
+
+// Starts tokenizing inputString. The string is not copied: only the pointer and its strlen() are kept.
+void M_TokenizerOpen(const char *inputString)
 {
-	return endPos;
+	size_t i;
+	tokenizerInput = inputString;
+	for (i = 0; i < NUMTOKENS; i++)
+	{
+		tokenCapacity[i] = 1024; // starting size; M_ReadTokenString allocates more when a token needs it
+		tokenizerToken[i] = (char*)Z_Malloc(tokenCapacity[i] * sizeof(char), PU_STATIC, NULL);
+	}
+	tokenizerInputLength = strlen(tokenizerInput);
 }
 
-/** Sets the current token's position.
- */
-void M_SetTokenPos(UINT32 newPos)
+// Stops tokenizing: frees every token buffer and rewinds the read position and comment state.
+void M_TokenizerClose(void)
+{
+	size_t i;
+	tokenizerInput = NULL; // M_TokenizerRead returns NULL from now on
+	for (i = 0; i < NUMTOKENS; i++)
+		Z_Free(tokenizerToken[i]); // NOTE(review): the pointer is left dangling until the next M_TokenizerOpen
+	tokenizerStartPos = 0;
+	tokenizerEndPos = 0;
+	tokenizerInComment = 0;
+}
+
+/** Checks whether a comment opens at input offset *pos and, if so, records its kind in
+  * tokenizerInComment (1 for a single-line comment, 2 for a multi-line one). Does nothing while
+  * already inside a comment, or when fewer than two characters remain at *pos.
+  */
+static void M_DetectComment(UINT32 *pos)
+{
+	if (tokenizerInComment)
+		return;
+	// Written as an addition: "tokenizerInputLength - 1" would wrap around for empty input
+	if (*pos + 1 >= tokenizerInputLength)
+		return;
+	if (tokenizerInput[*pos] != '/')
+		return;
+	if (tokenizerInput[*pos + 1] == '/')
+		tokenizerInComment = 1; // Single-line comment start
+	else if (tokenizerInput[*pos + 1] == '*')
+		tokenizerInComment = 2; // Multi-line comment start
+}
+
+// Copies input[tokenizerStartPos, tokenizerEndPos) into token buffer i as a NUL-terminated string.
+static void M_ReadTokenString(UINT32 i)
+{
+	UINT32 tokenLength = tokenizerEndPos - tokenizerStartPos;
+	if (tokenLength + 1 > tokenCapacity[i]) // +1 for the terminating NUL
+	{
+		// Too small: free the old buffer first (otherwise it leaks), then allocate a big enough one.
+		Z_Free(tokenizerToken[i]);
+		tokenCapacity[i] = tokenLength + 1;
+		tokenizerToken[i] = (char *)Z_Malloc(tokenCapacity[i] * sizeof(char), PU_STATIC, NULL);
+	}
+	M_Memcpy(tokenizerToken[i], tokenizerInput + tokenizerStartPos, (size_t)tokenLength);
+	tokenizerToken[i][tokenLength] = '\0';
+}
+
+/** Reads the next token into token buffer i; the string stays valid until that buffer is reused.
+  * Returns NULL at end of input, when no tokenizer is open, or when i is not a valid buffer. */
+const char *M_TokenizerRead(UINT32 i)
+{
+	if (!tokenizerInput || i >= NUMTOKENS)
+		return NULL;
+
+	tokenizerStartPos = tokenizerEndPos;
+	// Try to detect comments now, in case we're pointing right at one
+	M_DetectComment(&tokenizerStartPos);
+	// Skip whitespace, separators and comments. The bounds check comes first so this loop never reads past the end.
+	while (tokenizerStartPos < tokenizerInputLength
+			&& (tokenizerInput[tokenizerStartPos] == ' '
+			|| tokenizerInput[tokenizerStartPos] == '\t'
+			|| tokenizerInput[tokenizerStartPos] == '\r'
+			|| tokenizerInput[tokenizerStartPos] == '\n'
+			|| tokenizerInput[tokenizerStartPos] == '\0'
+			|| tokenizerInput[tokenizerStartPos] == '=' || tokenizerInput[tokenizerStartPos] == ';' // UDMF TEXTMAP.
+			|| tokenizerInComment != 0))
+	{
+		// Try to detect comment endings now
+		if (tokenizerInComment == 1 && tokenizerInput[tokenizerStartPos] == '\n')
+			tokenizerInComment = 0; // End of line for a single-line comment
+		else if (tokenizerInComment == 2
+			&& tokenizerStartPos + 1 < tokenizerInputLength
+			&& tokenizerInput[tokenizerStartPos] == '*'
+			&& tokenizerInput[tokenizerStartPos+1] == '/')
+		{
+			// End of multi-line comment
+			tokenizerInComment = 0;
+			tokenizerStartPos++; // Step over the '*'; the increment below steps over the '/'
+		}
+
+		tokenizerStartPos++;
+		M_DetectComment(&tokenizerStartPos);
+	}
+
+	// If the end of the string is reached (or overshot by a seek), no token is to be read
+	if (tokenizerStartPos >= tokenizerInputLength) {
+		tokenizerEndPos = tokenizerInputLength;
+		return NULL;
+	}
+	// Else, if it's one of these three symbols, capture only this one character
+	else if (tokenizerInput[tokenizerStartPos] == ','
+			|| tokenizerInput[tokenizerStartPos] == '{'
+			|| tokenizerInput[tokenizerStartPos] == '}')
+	{
+		tokenizerEndPos = tokenizerStartPos + 1;
+		tokenizerToken[i][0] = tokenizerInput[tokenizerStartPos];
+		tokenizerToken[i][1] = '\0';
+		return tokenizerToken[i];
+	}
+	// Return entire string within quotes, except without the quotes.
+	else if (tokenizerInput[tokenizerStartPos] == '"')
+	{
+		tokenizerEndPos = ++tokenizerStartPos;
+		while (tokenizerEndPos < tokenizerInputLength && tokenizerInput[tokenizerEndPos] != '"')
+			tokenizerEndPos++;
+		M_ReadTokenString(i);
+		if (tokenizerEndPos < tokenizerInputLength)
+			tokenizerEndPos++; // Step over the closing quote; an unterminated string has none to skip
+		return tokenizerToken[i];
+	}
+
+	// Now find the end of the token. This includes several additional characters that are okay to capture as one character, but not trailing at the end of another token.
+	tokenizerEndPos = tokenizerStartPos + 1;
+	while (tokenizerEndPos < tokenizerInputLength
+			&& tokenizerInput[tokenizerEndPos] != ' '
+			&& tokenizerInput[tokenizerEndPos] != '\t'
+			&& tokenizerInput[tokenizerEndPos] != '\r'
+			&& tokenizerInput[tokenizerEndPos] != '\n'
+			&& tokenizerInput[tokenizerEndPos] != ','
+			&& tokenizerInput[tokenizerEndPos] != '{'
+			&& tokenizerInput[tokenizerEndPos] != '}'
+			&& tokenizerInput[tokenizerEndPos] != '=' && tokenizerInput[tokenizerEndPos] != ';' // UDMF TEXTMAP.
+			&& tokenizerInComment == 0)
+	{
+		tokenizerEndPos++;
+		// Try to detect comment starts now; if it's in a comment, we don't want it in this token
+		M_DetectComment(&tokenizerEndPos);
+	}
+
+	M_ReadTokenString(i);
+	return tokenizerToken[i];
+}
+
+UINT32 M_TokenizerGetEndPos(void) // Returns the input offset at which the next M_TokenizerRead resumes.
+{
+	return tokenizerEndPos;
+}
+
+void M_TokenizerSetEndPos(UINT32 newPos) // Makes the next M_TokenizerRead resume at input offset newPos.
 {
-	endPos = newPos;
+	tokenizerEndPos = newPos; // NOTE(review): tokenizerInComment is not reset here; confirm callers only seek to offsets outside comments
 }
 
 /** Count bits in a number.
diff --git a/src/p_setup.c b/src/p_setup.c
index c6561a8a88..74f761f24a 100644
--- a/src/p_setup.c
+++ b/src/p_setup.c
@@ -1428,9 +1428,9 @@ UINT32 vertexesPos[UINT16_MAX];
 UINT32 sectorsPos[UINT16_MAX];
 
 // Determine total amount of map data in TEXTMAP.
-static boolean TextmapCount(UINT8 *data, size_t size)
+static boolean TextmapCount(size_t size)
 {
-	char *tkn = M_GetToken((char *)data);
+	const char *tkn = M_TokenizerRead(0);
 	UINT8 brackets = 0;
 
 	nummapthings = 0;
@@ -1442,20 +1442,16 @@ static boolean TextmapCount(UINT8 *data, size_t size)
 	// Look for namespace at the beginning.
 	if (!fastcmp(tkn, "namespace"))
 	{
-		Z_Free(tkn);
 		CONS_Alert(CONS_ERROR, "No namespace at beginning of lump!\n");
 		return false;
 	}
-	Z_Free(tkn);
 
 	// Check if namespace is valid.
-	tkn = M_GetToken(NULL);
+	tkn = M_TokenizerRead(0);
 	if (!fastcmp(tkn, "srb2"))
 		CONS_Alert(CONS_WARNING, "Invalid namespace '%s', only 'srb2' is supported.\n", tkn);
-	Z_Free(tkn);
 
-	tkn = M_GetToken(NULL);
-	while (tkn && M_GetTokenPos() < size)
+	while ((tkn = M_TokenizerRead(0)) && M_TokenizerGetEndPos() < size)
 	{
 		// Avoid anything inside bracketed stuff, only look for external keywords.
 		if (brackets)
@@ -1467,24 +1463,19 @@ static boolean TextmapCount(UINT8 *data, size_t size)
 			brackets++;
 		// Check for valid fields.
 		else if (fastcmp(tkn, "thing"))
-			mapthingsPos[nummapthings++] = M_GetTokenPos();
+			mapthingsPos[nummapthings++] = M_TokenizerGetEndPos();
 		else if (fastcmp(tkn, "linedef"))
-			linesPos[numlines++] = M_GetTokenPos();
+			linesPos[numlines++] = M_TokenizerGetEndPos();
 		else if (fastcmp(tkn, "sidedef"))
-			sidesPos[numsides++] = M_GetTokenPos();
+			sidesPos[numsides++] = M_TokenizerGetEndPos();
 		else if (fastcmp(tkn, "vertex"))
-			vertexesPos[numvertexes++] = M_GetTokenPos();
+			vertexesPos[numvertexes++] = M_TokenizerGetEndPos();
 		else if (fastcmp(tkn, "sector"))
-			sectorsPos[numsectors++] = M_GetTokenPos();
+			sectorsPos[numsectors++] = M_TokenizerGetEndPos();
 		else
 			CONS_Alert(CONS_NOTICE, "Unknown field '%s'.\n", tkn);
-
-		Z_Free(tkn);
-		tkn = M_GetToken(NULL);
 	}
 
-	Z_Free(tkn);
-
 	if (brackets)
 	{
 		CONS_Alert(CONS_ERROR, "Unclosed brackets detected in textmap lump.\n");
@@ -1494,7 +1485,7 @@ static boolean TextmapCount(UINT8 *data, size_t size)
 	return true;
 }
 
-static void ParseTextmapVertexParameter(UINT32 i, char *param, char *val)
+static void ParseTextmapVertexParameter(UINT32 i, const char *param, const char *val)
 {
 	if (fastcmp(param, "x"))
 		vertexes[i].x = FLOAT_TO_FIXED(atof(val));
@@ -1541,7 +1532,7 @@ typedef struct textmap_plane_s {
 textmap_plane_t textmap_planefloor = {0, 0, 0, 0, 0};
 textmap_plane_t textmap_planeceiling = {0, 0, 0, 0, 0};
 
-static void ParseTextmapSectorParameter(UINT32 i, char *param, char *val)
+static void ParseTextmapSectorParameter(UINT32 i, const char *param, const char *val)
 {
 	if (fastcmp(param, "heightfloor"))
 		sectors[i].floorheight = atol(val) << FRACBITS;
@@ -1565,7 +1556,7 @@ static void ParseTextmapSectorParameter(UINT32 i, char *param, char *val)
 		Tag_FSet(&sectors[i].tags, atol(val));
 	else if (fastcmp(param, "moreids"))
 	{
-		char* id = val;
+		const char* id = val;
 		while (id)
 		{
 			Tag_Add(&sectors[i].tags, atol(id));
@@ -1754,7 +1745,7 @@ static void ParseTextmapSectorParameter(UINT32 i, char *param, char *val)
 		sectors[i].triggerer = atol(val);
 }
 
-static void ParseTextmapSidedefParameter(UINT32 i, char *param, char *val)
+static void ParseTextmapSidedefParameter(UINT32 i, const char *param, const char *val)
 {
 	if (fastcmp(param, "offsetx"))
 		sides[i].textureoffset = atol(val)<<FRACBITS;
@@ -1772,13 +1763,13 @@ static void ParseTextmapSidedefParameter(UINT32 i, char *param, char *val)
 		sides[i].repeatcnt = atol(val);
 }
 
-static void ParseTextmapLinedefParameter(UINT32 i, char *param, char *val)
+static void ParseTextmapLinedefParameter(UINT32 i, const char *param, const char *val)
 {
 	if (fastcmp(param, "id"))
 		Tag_FSet(&lines[i].tags, atol(val));
 	else if (fastcmp(param, "moreids"))
 	{
-		char* id = val;
+		const char* id = val;
 		while (id)
 		{
 			Tag_Add(&lines[i].tags, atol(id));
@@ -1866,13 +1857,13 @@ static void ParseTextmapLinedefParameter(UINT32 i, char *param, char *val)
 		lines[i].flags |= ML_TFERLINE;
 }
 
-static void ParseTextmapThingParameter(UINT32 i, char *param, char *val)
+static void ParseTextmapThingParameter(UINT32 i, const char *param, const char *val)
 {
 	if (fastcmp(param, "id"))
 		Tag_FSet(&mapthings[i].tags, atol(val));
 	else if (fastcmp(param, "moreids"))
 	{
-		char* id = val;
+		const char* id = val;
 		while (id)
 		{
 			Tag_Add(&mapthings[i].tags, atol(id));
@@ -1923,32 +1914,25 @@ static void ParseTextmapThingParameter(UINT32 i, char *param, char *val)
   * \param Structure number (mapthings, sectors, ...).
   * \param Parser function pointer.
   */
-static void TextmapParse(UINT32 dataPos, size_t num, void (*parser)(UINT32, char *, char *))
+static void TextmapParse(UINT32 dataPos, size_t num, void (*parser)(UINT32, const char *, const char *))
 {
-	char *param, *val;
+	const char *param, *val;
 
-	M_SetTokenPos(dataPos);
-	param = M_GetToken(NULL);
+	M_TokenizerSetEndPos(dataPos);
+	param = M_TokenizerRead(0);
 	if (!fastcmp(param, "{"))
 	{
-		Z_Free(param);
 		CONS_Alert(CONS_WARNING, "Invalid UDMF data capsule!\n");
 		return;
 	}
-	Z_Free(param);
 
 	while (true)
 	{
-		param = M_GetToken(NULL);
+		param = M_TokenizerRead(0);
 		if (fastcmp(param, "}"))
-		{
-			Z_Free(param);
 			break;
-		}
-		val = M_GetToken(NULL);
+		val = M_TokenizerRead(1); // token buffer 1, so the param string held in buffer 0 is not overwritten
 		parser(num, param, val);
-		Z_Free(param);
-		Z_Free(val);
 	}
 }
 
@@ -2649,8 +2633,12 @@ static boolean P_LoadMapData(const virtres_t *virt)
 	if (udmf) // Count how many entries for each type we got in textmap.
 	{
 		virtlump_t *textmap = vres_Find(virt, "TEXTMAP");
-		if (!TextmapCount(textmap->data, textmap->size))
+		M_TokenizerOpen((char *)textmap->data);
+		if (!TextmapCount(textmap->size))
+		{
+			M_TokenizerClose();
 			return false;
+		}
 	}
 	else
 	{
@@ -2704,7 +2692,10 @@ static boolean P_LoadMapData(const virtres_t *virt)
 
 	// Load map data.
 	if (udmf)
+	{
 		P_LoadTextmap();
+		M_TokenizerClose();
+	}
 	else
 	{
 		P_LoadVertices(virtvertexes->data);
-- 
GitLab