Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
SRB2
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Model registry
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
STJr
SRB2
Commits
64047541
Commit
64047541
authored
3 months ago
by
Lactozilla
Browse files
Options
Downloads
Patches
Plain Diff
Make tokenizer aware of escape sequences
parent
20d02d35
Branches
Branches containing commit
Tags
Tags containing commit
1 merge request
!2647
Make UDMF parser aware of escape sequences
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
src/m_tokenizer.c
+162
-14
162 additions, 14 deletions
src/m_tokenizer.c
src/m_tokenizer.h
+2
-1
2 additions, 1 deletion
src/m_tokenizer.h
with
164 additions
and
15 deletions
src/m_tokenizer.c
+
162
−
14
View file @
64047541
// SONIC ROBO BLAST 2
//-----------------------------------------------------------------------------
// Copyright (C) 2013-202
4
by Sonic Team Junior.
// Copyright (C) 2013-202
5
by Sonic Team Junior.
//
// This program is free software distributed under the
// terms of the GNU General Public License, version 2.
...
...
@@ -28,6 +28,7 @@ tokenizer_t *Tokenizer_Open(const char *inputString, size_t len, unsigned numTok
tokenizer
->
endPos
=
0
;
tokenizer
->
inputLength
=
0
;
tokenizer
->
inComment
=
0
;
tokenizer
->
stringNeedsEscaping
=
false
;
tokenizer
->
inString
=
0
;
tokenizer
->
get
=
Tokenizer_Read
;
...
...
@@ -92,6 +93,124 @@ static void DetectComment(tokenizer_t *tokenizer, UINT32 *pos)
tokenizer
->
inComment
=
2
;
}
// This function detects escape sequences in a string and attempts to convert them.
//
// Recognised sequences:
//   \n  \t  \\  \"   - the usual single-character escapes
//   \xHH, \xHHHH     - exactly two or four hex digits, emitted as one or two bytes
//   \D, \DD, \DDD    - one to three decimal digits, value must be <= 255
//
// output      - destination buffer. Every output byte consumes at least one input
//               byte, so inputLength + 1 bytes is always enough room.
// input       - source characters; does not need to be NUL-terminated.
// inputLength - number of characters of input that may be read.
//
// Returns the number of bytes written to output, including the terminating NUL.
// No byte at or beyond input + inputLength is ever read: an escape sequence that
// is cut short by the end of the input is treated as malformed.
static size_t EscapeString(char *output, const char *input, size_t inputLength)
{
	const char *end = input + inputLength;
	size_t i = 0;

	while (input < end)
	{
		char chr = *input++;

		// Ordinary character: copy it through untouched.
		if (chr != '\\')
		{
			output[i++] = chr;
			continue;
		}

		// A backslash with nothing after it is a truncated escape sequence.
		if (input >= end)
			goto malformed;

		chr = *input++;

		switch (chr)
		{
			case 'n':
				output[i++] = '\n';
				break;
			case 't':
				output[i++] = '\t';
				break;
			case '\\':
				output[i++] = '\\';
				break;
			case '"':
				output[i++] = '\"';
				break;
			case 'x':
			{
				int value = 0;
				int digits = 0;

				// Accept up to five hex digits. That is one more than the longest
				// valid sequence, so over-long sequences are seen and rejected.
				// The ctype functions need an unsigned char value to be well-defined.
				while (digits < 5 && input < end && isxdigit((unsigned char)*input))
				{
					unsigned char digit = (unsigned char)*input++;
					int nibble = (digit <= '9') ? (digit - '0') : (tolower(digit) - 'a' + 10);

					value = (value << 4) | nibble;
					digits++;
				}

				// Four digits emit the high byte first, then share the low byte
				// with the two-digit form. Any other digit count is an error.
				if (digits == 4)
					output[i++] = (char)((value >> 8) & 0xFF);
				else if (digits != 2)
				{
					// CONS_Alert(CONS_WARNING, "Escape sequence has wrong size\n");
					goto malformed;
				}

				output[i++] = (char)(value & 0xFF);
				break;
			}
			default:
			{
				int value;
				int digits = 1;

				if (!isdigit((unsigned char)chr))
				{
					// CONS_Alert(CONS_WARNING, "Unknown escape sequence '\\%c'\n", chr);
					goto malformed;
				}

				// The first digit was already consumed; take at most two more.
				value = chr - '0';
				while (digits < 3 && input < end && isdigit((unsigned char)*input))
				{
					value = 10 * value + (*input++ - '0');
					digits++;
				}

				if (value > 255)
				{
					// CONS_Alert(CONS_WARNING, "Escape sequence is too large\n");
					goto malformed;
				}

				output[i++] = (char)value;
				break;
			}
		}
	}

	output[i] = '\0';
	return i + 1;

malformed:
	// TODO: Displaying parsing errors properly will require
	// some refactoring of the tokenizer itself. For now,
	// this function will silently return an empty string
	// if it encounters a malformed escape sequence.
	// This situation cannot happen for e.g. UDMF comments,
	// so it's okay to do this right now.
	output[0] = '\0';
	return 1;
}
static
void
Tokenizer_ReadTokenString
(
tokenizer_t
*
tokenizer
,
UINT32
i
)
{
UINT32
tokenLength
=
tokenizer
->
endPos
-
tokenizer
->
startPos
;
...
...
@@ -101,10 +220,46 @@ static void Tokenizer_ReadTokenString(tokenizer_t *tokenizer, UINT32 i)
// Assign the memory. Don't forget an extra byte for the end of the string!
tokenizer
->
token
[
i
]
=
(
char
*
)
Z_Malloc
(
tokenizer
->
capacity
[
i
]
*
sizeof
(
char
),
PU_STATIC
,
NULL
);
}
// Copy the string.
M_Memcpy
(
tokenizer
->
token
[
i
],
tokenizer
->
input
+
tokenizer
->
startPos
,
(
size_t
)
tokenLength
);
// Make the final character NUL.
tokenizer
->
token
[
i
][
tokenLength
]
=
'\0'
;
if
(
tokenizer
->
stringNeedsEscaping
)
{
EscapeString
(
tokenizer
->
token
[
i
],
tokenizer
->
input
+
tokenizer
->
startPos
,
(
size_t
)
tokenLength
);
}
else
{
M_Memcpy
(
tokenizer
->
token
[
i
],
tokenizer
->
input
+
tokenizer
->
startPos
,
(
size_t
)
tokenLength
);
// Make the final character NUL.
tokenizer
->
token
[
i
][
tokenLength
]
=
'\0'
;
}
}
// Advances tokenizer->endPos from its current position until it reaches an
// unescaped double quote or the end of the input, whichever comes first.
//
// If a backslash is encountered, tokenizer->stringNeedsEscaping is set and the
// character following the backslash is skipped, so that an escaped quote (\")
// does not terminate the scan. If the backslash is the very last character of
// the input, scanning stops with endPos equal to inputLength.
//
// DetectLineBreak is called for every position visited; the backslash check is
// only made when it returns false for that position.
// NOTE(review): presumably DetectLineBreak also keeps the line counter up to
// date - confirm against its definition.
static void ScanString(tokenizer_t *tokenizer)
{
	tokenizer->stringNeedsEscaping = false;

	// Test the bound before dereferencing, so that input[endPos] is never read
	// once endPos has reached inputLength.
	while (tokenizer->endPos < tokenizer->inputLength
		&& tokenizer->input[tokenizer->endPos] != '"')
	{
		if (!DetectLineBreak(tokenizer, tokenizer->endPos))
		{
			// Skip one character ahead if this looks like an escape sequence
			if (tokenizer->input[tokenizer->endPos] == '\\')
			{
				tokenizer->stringNeedsEscaping = true;
				tokenizer->endPos++;

				// Oh. Naughty. We hit the end of the input.
				// Stop scanning, then.
				if (tokenizer->endPos == tokenizer->inputLength)
					return;

				DetectLineBreak(tokenizer, tokenizer->endPos);
			}
		}

		tokenizer->endPos++;
	}
}
const
char
*
Tokenizer_Read
(
tokenizer_t
*
tokenizer
,
UINT32
i
)
...
...
@@ -117,11 +272,7 @@ const char *Tokenizer_Read(tokenizer_t *tokenizer, UINT32 i)
// If in a string, return the entire string within quotes, except without the quotes.
if
(
tokenizer
->
inString
==
1
)
{
while
(
tokenizer
->
input
[
tokenizer
->
endPos
]
!=
'"'
&&
tokenizer
->
endPos
<
tokenizer
->
inputLength
)
{
DetectLineBreak
(
tokenizer
,
tokenizer
->
endPos
);
tokenizer
->
endPos
++
;
}
ScanString
(
tokenizer
);
Tokenizer_ReadTokenString
(
tokenizer
,
i
);
tokenizer
->
inString
=
2
;
...
...
@@ -134,6 +285,7 @@ const char *Tokenizer_Read(tokenizer_t *tokenizer, UINT32 i)
tokenizer
->
token
[
i
][
0
]
=
tokenizer
->
input
[
tokenizer
->
startPos
];
tokenizer
->
token
[
i
][
1
]
=
'\0'
;
tokenizer
->
inString
=
0
;
tokenizer
->
stringNeedsEscaping
=
false
;
return
tokenizer
->
token
[
i
];
}
...
...
@@ -281,11 +433,7 @@ const char *Tokenizer_SRB2Read(tokenizer_t *tokenizer, UINT32 i)
else
if
(
tokenizer
->
input
[
tokenizer
->
startPos
]
==
'"'
)
{
tokenizer
->
endPos
=
++
tokenizer
->
startPos
;
while
(
tokenizer
->
input
[
tokenizer
->
endPos
]
!=
'"'
&&
tokenizer
->
endPos
<
tokenizer
->
inputLength
)
{
DetectLineBreak
(
tokenizer
,
tokenizer
->
endPos
);
tokenizer
->
endPos
++
;
}
ScanString
(
tokenizer
);
Tokenizer_ReadTokenString
(
tokenizer
,
i
);
tokenizer
->
endPos
++
;
...
...
This diff is collapsed.
Click to expand it.
src/m_tokenizer.h
+
2
−
1
View file @
64047541
// SONIC ROBO BLAST 2
//-----------------------------------------------------------------------------
// Copyright (C) 2013-202
4
by Sonic Team Junior.
// Copyright (C) 2013-202
5
by Sonic Team Junior.
//
// This program is free software distributed under the
// terms of the GNU General Public License, version 2.
...
...
@@ -26,6 +26,7 @@ typedef struct Tokenizer
UINT32
inputLength
;
UINT8
inComment
;
// 0 = not in comment, 1 = // Single-line, 2 = /* Multi-line */
UINT8
inString
;
// 0 = not in string, 1 = in string, 2 = just left string
boolean
stringNeedsEscaping
;
int
line
;
const
char
*
(
*
get
)(
struct
Tokenizer
*
,
UINT32
);
}
tokenizer_t
;
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment