Compare revisions

3a9987e3 · 06dfd360 · 06dfd360 · 06dfd360 · 06dfd360 · 06dfd360
--- a/src/sdl/i_threads.c
+++ b/src/sdl/i_threads.c
@@ -155,7 +155,7 @@ Worker (
 	return 0;
 }

-void
+I_thread_handle
 I_spawn_thread (
 		const char  * name,
 		I_thread_fn   entry,
@@ -189,6 +189,7 @@ I_spawn_thread (
 		}
 	}
 	I_unlock_mutex(i_thread_pool_mutex);
+	return (I_thread_handle)th;
 }

 int
@@ -354,3 +355,18 @@ I_wake_all_cond (
 	if (SDL_CondBroadcast(cond) == -1)
 		abort();
 }
+
+INT32 /* I_atomic_load: returns the value SDL_AtomicGet reports for `atomic` */
+I_atomic_load (
+		I_Atomicptr_t atomic /* handed to SDL_AtomicGet as-is; presumably an SDL_atomic_t pointer - TODO confirm against the typedef */
+){
+	return SDL_AtomicGet(atomic); /* thin wrapper: no extra locking or checks are done here */
+}
+
+INT32 /* I_atomic_exchange: stores `val` and returns SDL_AtomicSet's result */
+I_atomic_exchange (
+	I_Atomicptr_t atomic, /* handed to SDL_AtomicSet as-is; presumably an SDL_atomic_t pointer - TODO confirm against the typedef */
+	INT32         val /* new value passed to SDL_AtomicSet */
+){
+	return SDL_AtomicSet(atomic, val); /* SDL documents the return as the previous value, which is what makes this an exchange */
+}
--- a/src/tmap.nas
+++ b/src/tmap.nas
-;; SONIC ROBO BLAST 2
-;;-----------------------------------------------------------------------------
-;; Copyright (C) 1998-2000 by DooM Legacy Team.
-;; Copyright (C) 1999-2021 by Sonic Team Junior.
-;;
-;; This program is free software distributed under the
-;; terms of the GNU General Public License, version 2.
-;; See the 'LICENSE' file for more details.
-;;-----------------------------------------------------------------------------
-;; FILE:
-;;      tmap.nas
-;; DESCRIPTION:
-;;      Assembler optimised rendering code for software mode.
-;;      Draw wall columns.
-
-
-[BITS 32]
-
-%define FRACBITS 16
-%define TRANSPARENTPIXEL 255
-
-%ifdef LINUX
-%macro cextern 1
-[extern %1]
-%endmacro
-
-%macro cglobal 1
-[global %1]
-%endmacro
-
-%else
-%macro cextern 1
-%define %1 _%1
-[extern %1]
-%endmacro
-
-%macro cglobal 1
-%define %1 _%1
-[global %1]
-%endmacro
-
-%endif
-
-
-; The viddef_s structure. We only need the width field.
-struc viddef_s
-        resb 12
-.width: resb 4
-        resb 44
-endstruc
-
-;; externs
-;; columns
-cextern dc_x
-cextern dc_yl
-cextern dc_yh
-cextern ylookup
-cextern columnofs
-cextern dc_source
-cextern dc_texturemid
-cextern dc_texheight
-cextern dc_iscale
-cextern dc_hires
-cextern centery
-cextern centeryfrac
-cextern dc_colormap
-cextern dc_transmap
-cextern colormaps
-cextern vid
-cextern topleft
-
-; DELME
-cextern R_DrawColumn_8
-
-; polygon edge rasterizer
-cextern prastertab
-
-[SECTION .data]
-
-;;.align        4
-loopcount       dd      0
-pixelcount      dd      0
-tystep          dd      0
-
-[SECTION .text]
-
-;;----------------------------------------------------------------------
-;;
-;; R_DrawColumn : 8bpp column drawer
-;;
-;; New  optimised version 10-01-1998 by D.Fabrice and P.Boris
-;; Revised by G. Dick July 2010 to support the intervening twelve years'
-;; worth of changes to the renderer. Since I only vaguely know what I'm
-;; doing, this is probably rather suboptimal. Help appreciated!
-;;
-;;----------------------------------------------------------------------
-;; fracstep, vid.width in memory
-;; eax = accumulator
-;; ebx = colormap
-;; ecx = count
-;; edx = heightmask
-;; esi = source
-;; edi = dest
-;; ebp = frac
-;;----------------------------------------------------------------------
-
-cglobal R_DrawColumn_8_ASM
-;       align   16
-R_DrawColumn_8_ASM:
-        push    ebp                     ;; preserve caller's stack frame pointer
-        push    esi                     ;; preserve register variables
-        push    edi
-        push    ebx
-;;
-;; dest = ylookup[dc_yl] + columnofs[dc_x];
-;;
-        mov     ebp,[dc_yl]
-        mov     edi,[ylookup+ebp*4]
-        mov     ebx,[dc_x]
-        add     edi,[columnofs+ebx*4]  ;; edi = dest
-;;
-;; pixelcount = yh - yl + 1
-;;
-        mov     ecx,[dc_yh]
-        add     ecx,1
-        sub     ecx,ebp                 ;; pixel count
-        jle     near .done              ;; nothing to scale
-;;
-;; fracstep = dc_iscale;	// But we just use [dc_iscale]
-;; frac = (dc_texturemid + FixedMul((dc_yl << FRACBITS) - centeryfrac, fracstep));
-;;
-        mov     eax,ebp                 ;; dc_yl
-        shl     eax,FRACBITS
-        sub     eax,[centeryfrac]
-        imul    dword [dc_iscale]
-        shrd    eax,edx,FRACBITS
-        add     eax,[dc_texturemid]
-        mov     ebp,eax                 ;; ebp = frac
-
-        mov     ebx,[dc_colormap]
-
-        mov     esi,[dc_source]
-;;
-;; if (dc_hires) frac = 0;
-;;
-        test    byte [dc_hires],0x01
-        jz      .texheightcheck
-        xor     ebp,ebp
-
-;;
-;; Check for power of two
-;;
-.texheightcheck:
-        mov     edx,[dc_texheight]
-        sub     edx,1                   ;; edx = heightmask
-        test    edx,[dc_texheight]
-        jnz     .notpowertwo
-
-        test    ecx,0x01                ;; Test for odd no. pixels
-        jnz     .odd
-
-;;
-;; Texture height is a power of two, so we get modular arithmetic by
-;; masking
-;;
-.powertwo:
-        mov     eax,ebp                 ;; eax = frac
-        sar     eax,FRACBITS            ;; Integer part
-        and     eax,edx                 ;; eax &= heightmask
-        movzx   eax,byte [esi + eax]    ;; eax = texel
-        add     ebp,[dc_iscale]         ;; frac += fracstep
-        movzx   eax,byte [ebx+eax]      ;; Map through colormap
-        mov     [edi],al                ;; Write pixel
-                                        ;; dest += vid.width
-        add     edi,[vid + viddef_s.width]
-
-.odd:
-        mov     eax,ebp                 ;; eax = frac
-        sar     eax,FRACBITS            ;; Integer part
-        and     eax,edx                 ;; eax &= heightmask
-        movzx   eax,byte [esi + eax]    ;; eax = texel
-        add     ebp,[dc_iscale]         ;; frac += fracstep
-        movzx   eax,byte [ebx+eax]      ;; Map through colormap
-        mov     [edi],al                ;; Write pixel
-                                        ;; dest += vid.width
-        add     edi,[vid + viddef_s.width]
-
-
-        sub     ecx,2                   ;; count -= 2
-        jg      .powertwo
-
-        jmp     .done
-
-.notpowertwo:
-        add     edx,1
-        shl     edx,FRACBITS
-        test    ebp,ebp
-        jns     .notpowtwoloop
-
-.makefracpos:
-        add     ebp,edx                 ;; frac is negative; make it positive
-        js      .makefracpos
-
-.notpowtwoloop:
-        cmp     ebp,edx                 ;; Reduce mod height
-        jl      .writenonpowtwo
-        sub     ebp,edx
-        jmp     .notpowtwoloop
-
-.writenonpowtwo:
-        mov     eax,ebp                 ;; eax = frac
-        sar     eax,FRACBITS            ;; Integer part.
-        mov     bl,[esi + eax]          ;; ebx = colormap + texel
-        add     ebp,[dc_iscale]         ;; frac += fracstep
-        movzx   eax,byte [ebx]          ;; Map through colormap
-        mov     [edi],al                ;; Write pixel
-                                        ;; dest += vid.width
-        add     edi,[vid + viddef_s.width]
-
-        sub     ecx,1
-        jnz     .notpowtwoloop
-
-;;
-
-.done:
-        pop     ebx                     ;; restore register variables
-        pop     edi
-        pop     esi
-        pop     ebp                     ;; restore caller's stack frame pointer
-        ret
-
-
-;;----------------------------------------------------------------------
-;;
-;; R_Draw2sMultiPatchColumn : Like R_DrawColumn, but omits transparent
-;;                            pixels.
-;;
-;; New  optimised version 10-01-1998 by D.Fabrice and P.Boris
-;; Revised by G. Dick July 2010 to support the intervening twelve years'
-;; worth of changes to the renderer. Since I only vaguely know what I'm
-;; doing, this is probably rather suboptimal. Help appreciated!
-;;
-;;----------------------------------------------------------------------
-;; fracstep, vid.width in memory
-;; eax = accumulator
-;; ebx = colormap
-;; ecx = count
-;; edx = heightmask
-;; esi = source
-;; edi = dest
-;; ebp = frac
-;;----------------------------------------------------------------------
-
-cglobal R_Draw2sMultiPatchColumn_8_ASM
-;       align   16
-R_Draw2sMultiPatchColumn_8_ASM:
-        push    ebp                     ;; preserve caller's stack frame pointer
-        push    esi                     ;; preserve register variables
-        push    edi
-        push    ebx
-;;
-;; dest = ylookup[dc_yl] + columnofs[dc_x];
-;;
-        mov     ebp,[dc_yl]
-        mov     edi,[ylookup+ebp*4]
-        mov     ebx,[dc_x]
-        add     edi,[columnofs+ebx*4]  ;; edi = dest
-;;
-;; pixelcount = yh - yl + 1
-;;
-        mov     ecx,[dc_yh]
-        add     ecx,1
-        sub     ecx,ebp                 ;; pixel count
-        jle     near .done              ;; nothing to scale
-;;
-;; fracstep = dc_iscale;	// But we just use [dc_iscale]
-;; frac = (dc_texturemid + FixedMul((dc_yl << FRACBITS) - centeryfrac, fracstep));
-;;
-        mov     eax,ebp                 ;; dc_yl
-        shl     eax,FRACBITS
-        sub     eax,[centeryfrac]
-        imul    dword [dc_iscale]
-        shrd    eax,edx,FRACBITS
-        add     eax,[dc_texturemid]
-        mov     ebp,eax                 ;; ebp = frac
-
-        mov     ebx,[dc_colormap]
-
-        mov     esi,[dc_source]
-;;
-;; if (dc_hires) frac = 0;
-;;
-        test    byte [dc_hires],0x01
-        jz      .texheightcheck
-        xor     ebp,ebp
-
-;;
-;; Check for power of two
-;;
-.texheightcheck:
-        mov     edx,[dc_texheight]
-        sub     edx,1                   ;; edx = heightmask
-        test    edx,[dc_texheight]
-        jnz     .notpowertwo
-
-        test    ecx,0x01                ;; Test for odd no. pixels
-        jnz     .odd
-
-;;
-;; Texture height is a power of two, so we get modular arithmetic by
-;; masking
-;;
-.powertwo:
-        mov     eax,ebp                 ;; eax = frac
-        sar     eax,FRACBITS            ;; Integer part
-        and     eax,edx                 ;; eax &= heightmask
-        movzx   eax,byte [esi + eax]    ;; eax = texel
-        add     ebp,[dc_iscale]         ;; frac += fracstep
-        cmp     al,TRANSPARENTPIXEL     ;; Is pixel transparent?
-        je      .nextpowtwoeven         ;; If so, advance.
-        movzx   eax,byte [ebx+eax]      ;; Map through colormap
-        mov	    [edi],al                ;; Write pixel
-.nextpowtwoeven:
-                                        ;; dest += vid.width
-        add     edi,[vid + viddef_s.width]
-
-.odd:
-        mov     eax,ebp                 ;; eax = frac
-        sar     eax,FRACBITS            ;; Integer part
-        and     eax,edx                 ;; eax &= heightmask
-        movzx   eax,byte [esi + eax]    ;; eax = texel
-        add     ebp,[dc_iscale]         ;; frac += fracstep
-        cmp     al,TRANSPARENTPIXEL     ;; Is pixel transparent?
-        je      .nextpowtwoodd          ;; If so, advance.
-        movzx   eax,byte [ebx+eax]      ;; Map through colormap
-        mov     [edi],al                ;; Write pixel
-.nextpowtwoodd:
-                                        ;; dest += vid.width
-        add     edi,[vid + viddef_s.width]
-
-
-        sub     ecx,2                   ;; count -= 2
-        jg      .powertwo
-
-        jmp     .done
-
-.notpowertwo:
-        add     edx,1
-        shl     edx,FRACBITS
-        test    ebp,ebp
-        jns     .notpowtwoloop
-
-.makefracpos:
-        add     ebp,edx                 ;; frac is negative; make it positive
-        js      .makefracpos
-
-.notpowtwoloop:
-        cmp     ebp,edx                 ;; Reduce mod height
-        jl      .writenonpowtwo
-        sub     ebp,edx
-        jmp     .notpowtwoloop
-
-.writenonpowtwo:
-        mov     eax,ebp                 ;; eax = frac
-        sar     eax,FRACBITS            ;; Integer part.
-        mov     bl,[esi + eax]          ;; ebx = colormap + texel
-        add     ebp,[dc_iscale]         ;; frac += fracstep
-        cmp     bl,TRANSPARENTPIXEL     ;; Is pixel transparent?
-        je      .nextnonpowtwo          ;; If so, advance.
-        movzx   eax,byte [ebx]          ;; Map through colormap
-        mov     [edi],al                ;; Write pixel
-.nextnonpowtwo:
-                                        ;; dest += vid.width
-        add     edi,[vid + viddef_s.width]
-
-        sub     ecx,1
-        jnz     .notpowtwoloop
-
-;;
-
-.done:
-        pop     ebx                     ;; restore register variables
-        pop     edi
-        pop     esi
-        pop     ebp                     ;; restore caller's stack frame pointer
-        ret
-
-;;----------------------------------------------------------------------
-;; R_DrawTranslucentColumnA_8
-;;
-;; Vertical column texture drawer, with transparency. Replaces Doom2's
-;; 'fuzz' effect, which was not so beautiful.
-;; Transparency is always impressive in some way, don't know why...
-;;----------------------------------------------------------------------
-
-cglobal R_DrawTranslucentColumn_8_ASM
-R_DrawTranslucentColumn_8_ASM:
-        push    ebp                     ;; preserve caller's stack frame pointer
-        push    esi                     ;; preserve register variables
-        push    edi
-        push    ebx
-;;
-;; dest = ylookup[dc_yl] + columnofs[dc_x];
-;;
-        mov     ebp,[dc_yl]
-        mov     ebx,ebp
-        mov     edi,[ylookup+ebx*4]
-        mov     ebx,[dc_x]
-        add     edi,[columnofs+ebx*4]   ;; edi = dest
-;;
-;; pixelcount = yh - yl + 1
-;;
-        mov     eax,[dc_yh]
-        inc     eax
-        sub     eax,ebp                 ;; pixel count
-        mov     [pixelcount],eax        ;; save for final pixel
-        jle     near    vtdone         ;; nothing to scale
-;;
-;; frac = dc_texturemid - (centery-dc_yl)*fracstep;
-;;
-        mov     ecx,[dc_iscale]        ;; fracstep
-        mov     eax,[centery]
-        sub     eax,ebp
-        imul    eax,ecx
-        mov     edx,[dc_texturemid]
-        sub     edx,eax
-        mov     ebx,edx
-
-        shr     ebx,16                  ;; frac int.
-        and     ebx,0x7f
-        shl     edx,16                  ;; y frac up
-
-        mov     ebp,ecx
-        shl     ebp,16                  ;; fracstep f. up
-        shr     ecx,16                  ;; fracstep i. ->cl
-        and     cl,0x7f
-        push    cx
-        mov     ecx,edx
-        pop     cx
-        mov     edx,[dc_colormap]
-        mov     esi,[dc_source]
-;;
-;; lets rock :) !
-;;
-        mov     eax,[pixelcount]
-        shr     eax,0x2
-        test    byte [pixelcount],0x3
-        mov     ch,al                   ;; quad count
-        mov     eax,[dc_transmap]
-        je      vt4quadloop
-;;
-;;  do un-even pixel
-;;
-        test    byte [pixelcount],0x1
-        je      trf2
-
-        mov     ah,[esi+ebx]            ;; fetch texel : colormap number
-        add     ecx,ebp
-        adc     bl,cl
-        mov     al,[edi]                ;; fetch dest  : index into colormap
-        and     bl,0x7f
-        mov     dl,[eax]
-        mov     dl,[edx]
-        mov     [edi],dl
-pf:     add     edi,0x12345678
-;;
-;;  do two non-quad-aligned pixels
-;;
-trf2:    test    byte [pixelcount],0x2
-        je      trf3
-
-        mov     ah,[esi+ebx]            ;; fetch texel : colormap number
-        add     ecx,ebp
-        adc     bl,cl
-        mov     al,[edi]                ;; fetch dest  : index into colormap
-        and     bl,0x7f
-        mov     dl,[eax]
-        mov     dl,[edx]
-        mov     [edi],dl
-pg:     add     edi,0x12345678
-
-        mov     ah,[esi+ebx]            ;; fetch texel : colormap number
-        add     ecx,ebp
-        adc     bl,cl
-        mov     al,[edi]                ;; fetch dest  : index into colormap
-        and     bl,0x7f
-        mov     dl,[eax]
-        mov     dl,[edx]
-        mov     [edi],dl
-ph:     add     edi,0x12345678
-;;
-;;  test if there was at least 4 pixels
-;;
-trf3:   test    ch,0xff                 ;; test quad count
-        je near vtdone
-
-;;
-;; ebp : ystep frac. upper 24 bits
-;; edx : y     frac. upper 24 bits
-;; ebx : y     i.    lower 7 bits,  masked for index
-;; ecx : ch = counter, cl = y step i.
-;; eax : colormap aligned 256
-;; esi : source texture column
-;; edi : dest screen
-;;
-vt4quadloop:
-        mov     ah,[esi+ebx]            ;; fetch texel : colormap number
-        mov     [tystep],ebp
-pi:     add     edi,0x12345678
-        mov     al,[edi]                ;; fetch dest  : index into colormap
-pj:     sub     edi,0x12345678
-        mov     ebp,edi
-pk:     sub     edi,0x12345678
-        jmp short inloop
-align 4
-vtquadloop:
-        add     ecx,[tystep]
-        adc     bl,cl
-q1:     add     ebp,0x23456789
-        and     bl,0x7f
-        mov     dl,[eax]
-        mov     ah,[esi+ebx]            ;; fetch texel : colormap number
-        mov     dl,[edx]
-        mov     [edi],dl
-        mov     al,[ebp]                ;; fetch dest   : index into colormap
-inloop:
-        add     ecx,[tystep]
-        adc     bl,cl
-q2:     add     edi,0x23456789
-        and     bl,0x7f
-        mov     dl,[eax]
-        mov     ah,[esi+ebx]            ;; fetch texel : colormap number
-        mov     dl,[edx]
-        mov     [ebp+0x0],dl
-        mov     al,[edi]                ;; fetch dest   : index into colormap
-
-        add     ecx,[tystep]
-        adc     bl,cl
-q3:     add     ebp,0x23456789
-        and     bl,0x7f
-        mov     dl,[eax]
-        mov     ah,[esi+ebx]            ;; fetch texel : colormap number
-        mov     dl,[edx]
-        mov     [edi],dl
-        mov     al,[ebp]                ;; fetch dest   : index into colormap
-
-        add     ecx,[tystep]
-        adc     bl,cl
-q4:     add     edi,0x23456789
-        and     bl,0x7f
-        mov     dl,[eax]
-        mov     ah,[esi+ebx]            ;; fetch texel : colormap number
-        mov     dl,[edx]
-        mov     [ebp],dl
-        mov     al,[edi]                ;; fetch dest   : index into colormap
-
-        dec     ch
-        jne     vtquadloop
-vtdone:
-        pop     ebx
-        pop     edi
-        pop     esi
-        pop     ebp
-        ret
-
-;;----------------------------------------------------------------------
-;; R_DrawShadeColumn
-;;
-;;   for smoke..etc.. test.
-;;----------------------------------------------------------------------
-cglobal R_DrawShadeColumn_8_ASM
-R_DrawShadeColumn_8_ASM:
-        push    ebp                     ;; preserve caller's stack frame pointer
-        push    esi                     ;; preserve register variables
-        push    edi
-        push    ebx
-
-;;
-;; dest = ylookup[dc_yl] + columnofs[dc_x];
-;;
-        mov     ebp,[dc_yl]
-        mov     ebx,ebp
-        mov     edi,[ylookup+ebx*4]
-        mov     ebx,[dc_x]
-        add     edi,[columnofs+ebx*4]  ;; edi = dest
-;;
-;; pixelcount = yh - yl + 1
-;;
-        mov     eax,[dc_yh]
-        inc     eax
-        sub     eax,ebp                 ;; pixel count
-        mov     [pixelcount],eax       ;; save for final pixel
-        jle near shdone                ;; nothing to scale
-;;
-;; frac = dc_texturemid - (centery-dc_yl)*fracstep;
-;;
-        mov     ecx,[dc_iscale]        ;; fracstep
-        mov     eax,[centery]
-        sub     eax,ebp
-        imul    eax,ecx
-        mov     edx,[dc_texturemid]
-        sub     edx,eax
-        mov     ebx,edx
-        shr     ebx,16                  ;; frac int.
-        and     ebx,byte +0x7f
-        shl     edx,16                  ;; y frac up
-
-        mov     ebp,ecx
-        shl     ebp,16                  ;; fracstep f. up
-        shr     ecx,16                  ;; fracstep i. ->cl
-        and     cl,0x7f
-
-        mov     esi,[dc_source]
-;;
-;; lets rock :) !
-;;
-        mov     eax,[pixelcount]
-        mov     dh,al
-        shr     eax,2
-        mov     ch,al                   ;; quad count
-        mov     eax,[colormaps]
-        test    dh,3
-        je      sh4quadloop
-;;
-;;  do un-even pixel
-;;
-        test    dh,0x1
-        je      shf2
-
-        mov     ah,[esi+ebx]            ;; fetch texel : colormap number
-        add     edx,ebp
-        adc     bl,cl
-        mov     al,[edi]                ;; fetch dest  : index into colormap
-        and     bl,0x7f
-        mov     dl,[eax]
-        mov     [edi],dl
-pl:     add     edi,0x12345678
-;;
-;;  do two non-quad-aligned pixels
-;;
-shf2:
-        test    dh,0x2
-        je      shf3
-
-        mov     ah,[esi+ebx]            ;; fetch texel : colormap number
-        add     edx,ebp
-        adc     bl,cl
-        mov     al,[edi]                ;; fetch dest  : index into colormap
-        and     bl,0x7f
-        mov     dl,[eax]
-        mov     [edi],dl
-pm:     add     edi,0x12345678
-
-        mov     ah,[esi+ebx]            ;; fetch texel : colormap number
-        add     edx,ebp
-        adc     bl,cl
-        mov     al,[edi]                ;; fetch dest  : index into colormap
-        and     bl,0x7f
-        mov     dl,[eax]
-        mov     [edi],dl
-pn:     add     edi,0x12345678
-;;
-;;  test if there was at least 4 pixels
-;;
-shf3:
-        test    ch,0xff                 ;; test quad count
-        je near shdone
-
-;;
-;; ebp : ystep frac. upper 24 bits
-;; edx : y     frac. upper 24 bits
-;; ebx : y     i.    lower 7 bits,  masked for index
-;; ecx : ch = counter, cl = y step i.
-;; eax : colormap aligned 256
-;; esi : source texture column
-;; edi : dest screen
-;;
-sh4quadloop:
-        mov     dh,0x7f                 ;; prep mask
-        mov     ah,[esi+ebx]            ;; fetch texel : colormap number
-        mov     [tystep],ebp
-po:     add     edi,0x12345678
-        mov     al,[edi]                ;; fetch dest  : index into colormap
-pp:     sub     edi,0x12345678
-        mov     ebp,edi
-pq:     sub     edi,0x12345678
-        jmp short shinloop
-
-align  4
-shquadloop:
-        add     edx,[tystep]
-        adc     bl,cl
-        and     bl,dh
-q5:     add     ebp,0x12345678
-        mov     dl,[eax]
-        mov     ah,[esi+ebx]            ;; fetch texel : colormap number
-        mov     [edi],dl
-        mov     al,[ebp]                ;; fetch dest : index into colormap
-shinloop:
-        add     edx,[tystep]
-        adc     bl,cl
-        and     bl,dh
-q6:     add     edi,0x12345678
-        mov     dl,[eax]
-        mov     ah,[esi+ebx]            ;; fetch texel : colormap number
-        mov     [ebp],dl
-        mov     al,[edi]                ;; fetch dest : index into colormap
-
-        add     edx,[tystep]
-        adc     bl,cl
-        and     bl,dh
-q7:     add     ebp,0x12345678
-        mov     dl,[eax]
-        mov     ah,[esi+ebx]            ;; fetch texel : colormap number
-        mov     [edi],dl
-        mov     al,[ebp]                ;; fetch dest : index into colormap
-
-        add     edx,[tystep]
-        adc     bl,cl
-        and     bl,dh
-q8:     add     edi,0x12345678
-        mov     dl,[eax]
-        mov     ah,[esi+ebx]            ;; fetch texel : colormap number
-        mov     [ebp],dl
-        mov     al,[edi]                ;; fetch dest : index into colormap
-
-        dec     ch
-        jne     shquadloop
-
-shdone:
-        pop     ebx                     ;; restore register variables
-        pop     edi
-        pop     esi
-        pop     ebp                     ;; restore caller's stack frame pointer
-        ret
-
-
-;; ========================================================================
-;;  Rasterization of the segments of a LINEAR polygne textur of manire.
-;;  It is thus a question of interpolating coordinate them at the edges of texture in
-;;  the time that the X-coordinates minx/maxx for each line.
-;;  the argument ' dir' indicates which edges of texture are Interpol?:
-;;    0:  segments associs at edge TOP? and BOTTOM? (constant TY)
-;;    1:  segments associs at the LEFT and RIGHT edge (constant TX)
-;; ========================================================================
-;;
-;;  void   rasterize_segment_tex( LONG x1, LONG y1, LONG x2, LONG y2, LONG tv1, LONG tv2, LONG tc, LONG dir );
-;;                                   ARG1     ARG2     ARG3     ARG4      ARG5      ARG6     ARG7       ARG8
-;;
-;;  Pour dir = 0, (tv1,tv2) = (tX1,tX2), tc = tY, en effet TY est constant.
-;;
-;;  Pour dir = 1, (tv1,tv2) = (tY1,tY2), tc = tX, en effet TX est constant.
-;;
-;;
-;;  Uses:  extern struct rastery *_rastertab;
-;;
-
-MINX            EQU    0
-MAXX            EQU    4
-TX1             EQU    8
-TY1             EQU    12
-TX2             EQU    16
-TY2             EQU    20
-RASTERY_SIZEOF  EQU    24
-
-cglobal rasterize_segment_tex_asm
-rasterize_segment_tex_asm:
-        push    ebp
-        mov     ebp,esp
-
-        sub     esp,byte +0x8           ;; allocate the local variables
-
-        push    ebx
-        push    esi
-        push    edi
-        o16 mov ax,es
-        push    eax
-
-;;        #define DX       [ebp-4]
-;;        #define TD       [ebp-8]
-
-        mov     eax,[ebp+0xc]           ;; y1
-        mov     ebx,[ebp+0x14]          ;; y2
-        cmp     ebx,eax
-        je near .L_finished             ;; special (y1==y2) segment horizontal, exit!
-
-        jg near .L_rasterize_right
-
-;;rasterize_left:       ;; one rasterize a segment LEFT of the polygne
-
-        mov     ecx,eax
-        sub     ecx,ebx
-        inc     ecx                     ;; y1-y2+1
-
-        mov     eax,RASTERY_SIZEOF
-        mul     ebx                     ;; * y2
-        mov     esi,[prastertab]
-        add     esi,eax                 ;; point into rastertab[y2]
-
-        mov     eax,[ebp+0x8]           ;; ARG1
-        sub     eax,[ebp+0x10]          ;; ARG3
-        shl     eax,0x10                ;;     ((x1-x2)<<PRE) ...
-        cdq
-        idiv    ecx                     ;; dx =     ...        / (y1-y2+1)
-        mov     [ebp-0x4],eax           ;; DX
-
-        mov     eax,[ebp+0x18]          ;; ARG5
-        sub     eax,[ebp+0x1c]          ;; ARG6
-        shl     eax,0x10
-        cdq
-        idiv    ecx                     ;;      tdx =((tx1-tx2)<<PRE) / (y1-y2+1)
-        mov     [ebp-0x8],eax           ;; idem tdy =((ty1-ty2)<<PRE) / (y1-y2+1)
-
-        mov     eax,[ebp+0x10]          ;; ARG3
-        shl     eax,0x10                ;; x = x2<<PRE
-
-        mov     ebx,[ebp+0x1c]          ;; ARG6
-        shl     ebx,0x10                ;; tx = tx2<<PRE    d0
-                                        ;; ty = ty2<<PRE    d1
-        mov     edx,[ebp+0x20]          ;; ARG7
-        shl     edx,0x10                ;; ty = ty<<PRE     d0
-                                        ;; tx = tx<<PRE     d1
-        push    ebp
-        mov     edi,[ebp-0x4]           ;; DX
-        cmp     dword [ebp+0x24],byte +0x0      ;; ARG8   direction ?
-
-        mov     ebp,[ebp-0x8]           ;; TD
-        je      .L_rleft_h_loop
-;;
-;; TY varies, TX is constant
-;;
-.L_rleft_v_loop:
-        mov     [esi+MINX],eax           ;; rastertab[y].minx = x
-          add     ebx,ebp
-        mov     [esi+TX1],edx           ;;             .tx1  = tx
-          add     eax,edi
-        mov     [esi+TY1],ebx           ;;             .ty1  = ty
-
-        ;;addl    DX, %eax        // x     += dx
-        ;;addl    TD, %ebx        // ty    += tdy
-
-        add     esi,RASTERY_SIZEOF      ;; next raster line into rastertab[]
-        dec     ecx
-        jne     .L_rleft_v_loop
-        pop     ebp
-        jmp     .L_finished
-;;
-;; TX varies, TY is constant
-;;
-.L_rleft_h_loop:
-        mov     [esi+MINX],eax           ;; rastertab[y].minx = x
-          add     eax,edi
-        mov     [esi+TX1],ebx           ;;             .tx1  = tx
-          add     ebx,ebp
-        mov     [esi+TY1],edx           ;;             .ty1  = ty
-
-        ;;addl    DX, %eax        // x     += dx
-        ;;addl    TD, %ebx        // tx    += tdx
-
-        add     esi,RASTERY_SIZEOF      ;; next raster line into rastertab[]
-        dec     ecx
-        jne     .L_rleft_h_loop
-        pop     ebp
-        jmp     .L_finished
-;;
-;; one rasterize a segment LINE of the polygne
-;;
-.L_rasterize_right:
-        mov     ecx,ebx
-        sub     ecx,eax
-        inc     ecx                     ;; y2-y1+1
-
-        mov     ebx,RASTERY_SIZEOF
-        mul     ebx                     ;;   * y1
-        mov     esi,[prastertab]
-        add     esi,eax                 ;;  point into rastertab[y1]
-
-        mov     eax,[ebp+0x10]          ;; ARG3
-        sub     eax,[ebp+0x8]           ;; ARG1
-        shl     eax,0x10                ;; ((x2-x1)<<PRE) ...
-        cdq
-        idiv    ecx                     ;;  dx =     ...        / (y2-y1+1)
-        mov     [ebp-0x4],eax           ;; DX
-
-        mov     eax,[ebp+0x1c]          ;; ARG6
-        sub     eax,[ebp+0x18]          ;; ARG5
-        shl     eax,0x10
-        cdq
-        idiv    ecx                     ;;       tdx =((tx2-tx1)<<PRE) / (y2-y1+1)
-        mov     [ebp-0x8],eax           ;;  idem tdy =((ty2-ty1)<<PRE) / (y2-y1+1)
-
-        mov     eax,[ebp+0x8]           ;; ARG1
-        shl     eax,0x10                ;; x  = x1<<PRE
-
-        mov     ebx,[ebp+0x18]          ;; ARG5
-        shl     ebx,0x10                ;; tx = tx1<<PRE    d0
-                                        ;; ty = ty1<<PRE    d1
-        mov     edx,[ebp+0x20]          ;; ARG7
-        shl     edx,0x10                ;; ty = ty<<PRE     d0
-                                        ;; tx = tx<<PRE     d1
-        push    ebp
-        mov     edi,[ebp-0x4]           ;; DX
-
-        cmp     dword [ebp+0x24], 0     ;; direction ?
-
-         mov     ebp,[ebp-0x8]          ;; TD
-        je      .L_rright_h_loop
-;;
-;; TY varies, TX is constant
-;;
-.L_rright_v_loop:
-
-        mov     [esi+MAXX],eax           ;; rastertab[y].maxx = x
-          add     ebx,ebp
-        mov     [esi+TX2],edx          ;;             .tx2  = tx
-          add     eax,edi
-        mov     [esi+TY2],ebx          ;;             .ty2  = ty
-
-        ;;addl    DX, %eax        // x     += dx
-        ;;addl    TD, %ebx        // ty    += tdy
-
-        add     esi,RASTERY_SIZEOF
-        dec     ecx
-        jne     .L_rright_v_loop
-
-        pop     ebp
-
-        jmp     short .L_finished
-;;
-;; TX varies, TY is constant
-;;
-.L_rright_h_loop:
-        mov     [esi+MAXX],eax           ;; rastertab[y].maxx = x
-          add     eax,edi
-        mov     [esi+TX2],ebx          ;;             .tx2  = tx
-          add     ebx,ebp
-        mov     [esi+TY2],edx          ;;             .ty2  = ty
-
-        ;;addl    DX, %eax        // x     += dx
-        ;;addl    TD, %ebx        // tx    += tdx
-
-        add     esi,RASTERY_SIZEOF
-        dec     ecx
-        jne     .L_rright_h_loop
-
-        pop     ebp
-
-.L_finished:
-        pop     eax
-        o16 mov es,ax
-        pop     edi
-        pop     esi
-        pop     ebx
-
-        mov     esp,ebp
-        pop     ebp
-        ret
--- a/src/tmap.s
+++ b/src/tmap.s
-// SONIC ROBO BLAST 2
-//-----------------------------------------------------------------------------
-// Copyright (C) 1998-2000 by DooM Legacy Team.
-// Copyright (C) 1999-2021 by Sonic Team Junior.
-//
-// This program is free software distributed under the
-// terms of the GNU General Public License, version 2.
-// See the 'LICENSE' file for more details.
-//-----------------------------------------------------------------------------
-/// \file  tmap.s
-/// \brief optimised drawing routines for span/column rendering
-
-// structures, must match the C structures!
-#include "asm_defs.inc"
-
-// Rappel: seuls EAX, ECX, EDX peuvent être écrasés librement.
-//         il faut sauver esi,edi, cd...gs
-
-/* Attention aux comparaisons!                                              */
-/*                                                                          */
-/*      Intel_compare:                                                      */
-/*                                                                          */
-/*              cmp     A,B                     // A-B , set flags          */
-/*              jg      A_greater_than_B                                    */
-/*                                                                          */
-/*      AT&T_compare:                                                       */
-/*                                                                          */
-/*              cmp     A,B                     // B-A , set flags          */
-/*              jg      B_greater_than_A                                    */
-/*                                                                          */
-/*        (subtracts the source operand FROM the destination operand,       */
-/*         as on Motorola! )                                                */
-
-// REMINDER: Intel
-//         SECTION:[BASE+INDEX*SCALE+DISP]
-// becomes SECTION:DISP(BASE,INDEX,SCALE)
-
-//----------------------------------------------------------------------
-//
-// R_DrawColumn
-//
-//   New optimised version 10-01-1998 by D.Fabrice and P.Boris
-//   TO DO: optimise it much farther... should take at most 3 cycles/pix
-//          once it's fixed, add code to patch the offsets so that it
-//          works in every screen width.
-//
-//----------------------------------------------------------------------
-
-// Scratch variables shared by the drawing routines in this file:
-//   loopcount  - written by R_DrawSpan_8 (pixelcount >> 1)
-//   pixelcount - written by every drawer right after it computes its count
-//   tystep     - holds the fractional y step in the translucent/shade
-//                drawers, which reuse ebp as a second destination pointer
-    .data
-#ifdef LINUX
-    .align 2
-#else
-    .align 4
-#endif
-C(loopcount):   .long   0
-C(pixelcount):  .long   0
-C(tystep):      .long   0
-
-// vidwidth is set by ASM_PatchRowBytes from its first argument.
-C(vidwidth):    .long   0       //use this one out of the inner loops
-                                //so you don't need to patch everywhere...
-
-#ifdef USEASM
-#if !defined( LINUX)
-    .text
-#endif
-// ASM_PatchRowBytes
-//
-//   Copies its first stack argument (ARG1) into vidwidth, then stores 1x, 2x,
-//   3x and 4x that value at offset +2 from each patch label (p1..p9, p1b..p4b,
-//   w1..w4, sh5..sh9).  At the labels visible in this file the patched
-//   instructions are coded with 0x12345678 placeholders scaled by the same
-//   factor (1x, 2x, 3x or 4x).
-//   Because it stores into instructions, the code must be writable.
-//   NOTE(review): the +2 presumably skips the opcode and ModRM bytes so the
-//   store lands on the 32-bit displacement/immediate - confirm against the
-//   assembled encoding.
-//   NOTE(review): w1..w4 are not defined in the visible part of the file;
-//   presumably they belong to R_DrawWaterColumn - confirm.
-.globl C(ASM_PatchRowBytes)
-C(ASM_PatchRowBytes):
-    pushl   %ebp
-    movl    %esp, %ebp      // set up the frame pointer so stack arguments are addressable
-
-    movl    ARG1, %edx         // read first arg
-    movl    %edx, C(vidwidth)
-
-    // 1 * vidwidth
-    movl    %edx,p1+2
-    movl    %edx,w1+2   //water
-    movl    %edx,p1b+2  //sky
-
-    movl    %edx,p5+2
-      movl    %edx,sh5+2        //smokie test
-
-    // 2 * vidwidth
-    addl    ARG1,%edx
-
-    movl    %edx,p2+2
-    movl    %edx,w2+2   //water
-    movl    %edx,p2b+2  //sky
-
-    // p6..p9 / sh6..sh9 are the "addl $2*0x12345678" pointer steps of the
-    // translucent and shade loops
-    movl    %edx,p6+2
-    movl    %edx,p7+2
-    movl    %edx,p8+2
-    movl    %edx,p9+2
-      movl    %edx,sh6+2         //smokie test
-      movl    %edx,sh7+2
-      movl    %edx,sh8+2
-      movl    %edx,sh9+2
-
-    // 3 * vidwidth
-    addl    ARG1,%edx
-
-    movl    %edx,p3+2
-    movl    %edx,w3+2   //water
-    movl    %edx,p3b+2  //sky
-
-    // 4 * vidwidth
-    addl    ARG1,%edx
-
-    movl    %edx,p4+2
-    movl    %edx,w4+2   //water
-    movl    %edx,p4b+2  //sky
-
-    popl    %ebp
-    ret
-
-
-#ifdef LINUX
-    .align 2
-#else
-    .align 5
-#endif
-// R_DrawColumn_8
-//
-//   Takes no stack arguments; reads dc_yl, dc_yh, dc_x, dc_iscale, centery,
-//   dc_texturemid, dc_source and dc_colormap.  Writes one colormapped byte
-//   per row from dc_yl to dc_yh, advancing the destination by vidwidth per
-//   pixel.  The texture row index is masked with 0x7f after every step.
-//   The 4x unrolled loop uses the p1..p4 operands patched by
-//   ASM_PatchRowBytes.
-//   Only the low 8 bits of (pixelcount >> 2) are kept as the quad counter.
-.globl C(R_DrawColumn_8)
-C(R_DrawColumn_8):
-    pushl   %ebp                // preserve caller's stack frame pointer
-    pushl   %esi                // preserve register variables
-    pushl   %edi
-    pushl   %ebx
-
-//
-// dest = ylookup[dc_yl] + columnofs[dc_x];
-//
-    movl     C(dc_yl),%ebp
-    movl     %ebp,%ebx
-    movl     C(ylookup)(,%ebx,4),%edi
-    movl     C(dc_x),%ebx
-    addl     C(columnofs)(,%ebx,4),%edi  // edi = dest
-
-//
-// pixelcount = yh - yl + 1
-//
-    movl     C(dc_yh),%eax
-    incl     %eax
-    subl     %ebp,%eax                   // pixel count
-    movl     %eax,C(pixelcount)          // save for final pixel
-    // movl leaves the flags alone, so jle still tests the subl result
-    jle      vdone                       // nothing to scale
-
-//
-// frac = dc_texturemid - (centery-dc_yl)*fracstep;
-//
-    movl     C(dc_iscale),%ecx           // fracstep
-    movl     C(centery),%eax
-    subl     %ebp,%eax
-    imul     %ecx,%eax
-    movl     C(dc_texturemid),%edx
-    subl     %eax,%edx
-     movl     %edx,%ebx
-     shrl     $16,%ebx          // frac int.
-     andl     $0x0000007f,%ebx
-     shll     $16,%edx          // y frac up
-
-     movl     %ecx,%ebp
-     shll     $16,%ebp          // fracstep f. up
-     shrl     $16,%ecx          // fracstep i. ->cl
-     andb     $0x7f,%cl
-
-    movl     C(dc_source),%esi
-
-//
-// lets rock :) !
-//
-    movl    C(pixelcount),%eax
-    movb    %al,%dh             // low byte of the count, tested for &1 / &2 below
-    shrl    $2,%eax
-    movb    %al,%ch             // quad count
-    // from here on al is overwritten with each texel, so (%eax) indexes the
-    // colormap by texel value
-    movl    C(dc_colormap),%eax
-    testb   $3,%dh
-    jz      v4quadloop
-
-//
-//  do un-even pixel
-//
-    testb   $1,%dh
-    jz      2f
-
-    movb    (%esi,%ebx),%al     // prep un-even loops
-     addl    %ebp,%edx            // ypos f += ystep f
-    adcb    %cl,%bl              // ypos i += ystep i
-     movb    (%eax),%dl           // colormap texel
-    andb    $0x7f,%bl            // mask 0-127 texture index
-     movb    %dl,(%edi)           // output pixel
-    addl    C(vidwidth),%edi
-
-//
-//  do two non-quad-aligned pixels
-//
-2:
-    testb   $2,%dh
-    jz      3f
-
-    movb    (%esi,%ebx),%al      // fetch source texel
-     addl    %ebp,%edx            // ypos f += ystep f
-    adcb    %cl,%bl              // ypos i += ystep i
-     movb    (%eax),%dl           // colormap texel
-    andb    $0x7f,%bl            // mask 0-127 texture index
-     movb    %dl,(%edi)           // output pixel
-
-    movb    (%esi,%ebx),%al      // fetch source texel
-     addl    %ebp,%edx            // ypos f += ystep f
-    adcb    %cl,%bl              // ypos i += ystep i
-     movb    (%eax),%dl           // colormap texel
-    andb    $0x7f,%bl            // mask 0-127 texture index
-    addl    C(vidwidth),%edi
-     movb    %dl,(%edi)           // output pixel
-
-    addl    C(vidwidth),%edi
-
-//
-//  test if there was at least 4 pixels
-//
-3:
-    testb   $0xFF,%ch           // test quad count
-    jz      vdone
-
-//
-// ebp : ystep frac. upper 24 bits
-// edx : y     frac. upper 24 bits
-// ebx : y     i.    lower 7 bits,  masked for index
-// ecx : ch = counter, cl = y step i.
-// eax : colormap aligned 256
-// esi : source texture column
-// edi : dest screen
-//
-v4quadloop:
-    // dh is not read again in this routine; the loop below masks with the
-    // immediate $0x7f instead
-    movb    $0x7f,%dh           // prep mask
-//    .align  4
-vquadloop:
-    movb    (%esi,%ebx),%al     // prep loop
-     addl    %ebp,%edx            // ypos f += ystep f
-    adcb    %cl,%bl              // ypos i += ystep i
-     movb    (%eax),%dl           // colormap texel
-    movb    %dl,(%edi)           // output pixel
-     andb    $0x7f,%bl            // mask 0-127 texture index
-
-    // p1..p4: the 0x12345678 operands are overwritten with multiples of
-    // vidwidth by ASM_PatchRowBytes
-    movb    (%esi,%ebx),%al      // fetch source texel
-     addl    %ebp,%edx
-    adcb    %cl,%bl
-     movb    (%eax),%dl
-p1:    movb    %dl,0x12345678(%edi)
-     andb    $0x7f,%bl
-
-    movb    (%esi,%ebx),%al      // fetch source texel
-     addl    %ebp,%edx
-    adcb    %cl,%bl
-     movb    (%eax),%dl
-p2:    movb    %dl,2*0x12345678(%edi)
-     andb    $0x7f,%bl
-
-    movb    (%esi,%ebx),%al      // fetch source texel
-     addl    %ebp,%edx
-    adcb    %cl,%bl
-     movb    (%eax),%dl
-p3:    movb    %dl,3*0x12345678(%edi)
-     andb    $0x7f,%bl
-
-p4:    addl    $4*0x12345678,%edi
-
-    decb   %ch
-     jnz    vquadloop
-
-vdone:
-    popl    %ebx                // restore register variables
-    popl    %edi
-    popl    %esi
-    popl    %ebp                // restore caller's stack frame pointer
-    ret
-
-#ifdef HORIZONTALDRAW
-// --------------------------------------------------------------------------
-// Horizontal Column Drawer Optimisation
-// --------------------------------------------------------------------------
-
-#ifdef LINUX
-    .align 2
-#else
-    .align 5
-#endif
-// R_DrawHColumn_8
-//
-//   Variant of R_DrawColumn_8 that computes
-//   dest = yhlookup[dc_x] + hcolumnofs[dc_yl] and writes successive pixels
-//   to successive bytes (edi is incremented by one before each store).
-//   Only whole groups of four pixels are drawn: vhnearlydone is empty, so
-//   the remaining (pixelcount & 3) pixels are never written.
-.globl C(R_DrawHColumn_8)
-C(R_DrawHColumn_8):
-    pushl   %ebp
-    pushl   %esi
-    pushl   %edi
-    pushl   %ebx
-
-//
-// dest = yhlookup[dc_x] + hcolumnofs[dc_yl];
-//
-    movl    C(dc_x),%ebx
-    movl    C(yhlookup)(,%ebx,4),%edi
-    movl    C(dc_yl),%ebp
-    movl    %ebp,%ebx
-    addl    C(hcolumnofs)(,%ebx,4),%edi  // edi = dest
-
-//
-// pixelcount = yh - yl + 1
-//
-    movl     C(dc_yh),%eax
-    incl     %eax
-    subl     %ebp,%eax                   // pixel count
-    movl     %eax,C(pixelcount)          // save for final pixel
-    jle      vhdone                      // nothing to scale
-
-//
-// frac = dc_texturemid - (centery-dc_yl)*fracstep;
-//
-    movl     C(dc_iscale),%ecx           // fracstep
-    movl     C(centery),%eax
-    subl     %ebp,%eax
-    imul     %ecx,%eax
-    movl     C(dc_texturemid),%edx
-    subl     %eax,%edx
-     movl     %edx,%ebx
-     shrl     $16,%ebx          // frac int.
-     andl     $0x0000007f,%ebx
-     shll     $16,%edx          // y frac up
-
-     movl     %ecx,%ebp
-     shll     $16,%ebp          // fracstep f. up
-     shrl     $16,%ecx          // fracstep i. ->cl
-     andb     $0x7f,%cl
-
-    movl     C(dc_source),%esi
-
-//
-// lets rock :) !
-//
-
-    movl    C(pixelcount),%eax
-    movb    %al,%dh
-    shrl    $2,%eax
-    movb    %al,%ch     // quad count
-
-    testb   %ch, %ch
-    jz      vhnearlydone
-
-    movl    C(dc_colormap),%eax
-    // pre-decrement: every pixel below does "incl %edi" before its store
-    decl    %edi                  //-----
-
-// four pixels per iteration; dh and dl alternate as the temporary that
-// carries the colormapped byte
-vhloop:
-    movb    (%esi,%ebx),%al      // fetch source texel
-     addl    %ebp,%edx
-    adcb    %cl,%bl
-     andb    $0x7f,%bl
-    incl    %edi                 //-----
-     movb    (%eax),%dh
-    movb    %dh,(%edi)           //-----
-
-     movb    (%esi,%ebx),%al      // fetch source texel
-    addl    %ebp,%edx
-     incl    %edi                //-----
-    adcb    %cl,%bl
-     movb    (%eax),%dl
-    andb    $0x7f,%bl
-     movb    %dl,(%edi)          //-----
-
-    movb    (%esi,%ebx),%al      // fetch source texel
-     addl    %ebp,%edx
-    adcb    %cl,%bl
-//    shll    $16,%edx
-     andb    $0x7f,%bl
-    incl    %edi                //-----
-     movb    (%eax),%dh
-    movb    %dh,(%edi)          //-----
-
-     movb    (%esi,%ebx),%al      // fetch source texel
-    addl    %ebp,%edx
-     incl    %edi               //-----
-    adcb    %cl,%bl
-     movb    (%eax),%dl
-    andb    $0x7f,%bl
-     movb    %dl,(%edi)
-//     movl    %edx,(%edi)
-//    addl    $4,%edi
-
-    decb   %ch
-     jnz    vhloop
-
-// NOTE(review): no code here handles the (pixelcount & 3) leftover pixels;
-// the routine looks unfinished - confirm.
-vhnearlydone:
-//    movl    C(pixelcount)
-
-vhdone:
-    popl    %ebx
-    popl    %edi
-    popl    %esi
-    popl    %ebp
-    ret
-
-
-// --------------------------------------------------------------------------
-// Rotate a buffer 90 degree in clockwise order after horiz.col. draws
-// --------------------------------------------------------------------------
-
-#ifdef LINUX
-    .align 2
-#else
-    .align 5
-#endif
-// R_RotateBuffer
-//
-//   Loads dc_source into esi and dc_colormap into edi, gathers four bytes
-//   (two through esi, two through ebx, each pointer advanced by 200 after a
-//   read) into eax, and stores that single dword at (edi).  There is no
-//   loop: exactly one dword is written per call.
-//   NOTE(review): ebx is saved on entry but never loaded before it is
-//   dereferenced, so it still holds the caller's value.  The routine looks
-//   like an unfinished stub - confirm before relying on it.
-.globl C(R_RotateBuffer)
-C(R_RotateBuffer):
-    pushl   %ebp
-    pushl   %esi
-    pushl   %edi
-    pushl   %ebx
-
-
-    movl    C(dc_source),%esi
-    movl    C(dc_colormap),%edi
-
-
-    movb    (%esi),%ah
-     addl    $200,%esi
-    movb    (%ebx),%al
-     addl    $200,%ebx
-    // bswap moves the two bytes just loaded into the upper half of eax,
-    // freeing ah/al for the next two
-    bswap    %eax
-    movb    (%esi),%ah
-     addl    $200,%esi
-    movb    (%ebx),%al
-     addl    $200,%ebx
-    movl    %eax,(%edi)
-     addl    $4,%edi
-
-
-    popl    %ebx
-    popl    %edi
-    popl    %esi
-    popl    %ebp
-    ret
-#endif
-
-//----------------------------------------------------------------------
-//13-02-98:
-//   R_DrawSkyColumn : same as R_DrawColumn but:
-//
-//            - wrap around 256 instead of 127.
-//   this is needed because we have a higher texture for mouselook,
-//   we need at least 200 lines for the sky.
-//
-//   NOTE: the sky should never wrap, so it could use a faster method.
-//         for the moment, we'll still use a wrapping method...
-//
-//  IT S JUST A QUICK CUT N PASTE, WAS NOT OPTIMISED AS IT SHOULD BE !!!
-//
-//----------------------------------------------------------------------
-
-#ifdef LINUX
-    .align 2
-#else
-    .align 5
-#endif
-// R_DrawSkyColumn_8
-//
-//   Same structure as R_DrawColumn_8, reading the same dc_* variables, but
-//   the texture row index is masked with 0xff once at start-up and is never
-//   masked inside the loops, so bl simply wraps at 256.
-//   The unrolled loop uses the p1b..p4b operands patched by
-//   ASM_PatchRowBytes.
-.globl C(R_DrawSkyColumn_8)
-C(R_DrawSkyColumn_8):
-    pushl   %ebp
-    pushl   %esi
-    pushl   %edi
-    pushl   %ebx
-
-//
-// dest = ylookup[dc_yl] + columnofs[dc_x];
-//
-    movl     C(dc_yl),%ebp
-    movl     %ebp,%ebx
-    movl     C(ylookup)(,%ebx,4),%edi
-    movl     C(dc_x),%ebx
-    addl     C(columnofs)(,%ebx,4),%edi  // edi = dest
-
-//
-// pixelcount = yh - yl + 1
-//
-    movl     C(dc_yh),%eax
-    incl     %eax
-    subl     %ebp,%eax                   // pixel count
-    movl     %eax,C(pixelcount)          // save for final pixel
-    jle      vskydone                       // nothing to scale
-
-//
-// frac = dc_texturemid - (centery-dc_yl)*fracstep;
-//
-    movl     C(dc_iscale),%ecx           // fracstep
-    movl     C(centery),%eax
-    subl     %ebp,%eax
-    imul     %ecx,%eax
-    movl     C(dc_texturemid),%edx
-    subl     %eax,%edx
-     movl     %edx,%ebx
-     shrl     $16,%ebx          // frac int.
-     andl     $0x000000ff,%ebx
-     shll     $16,%edx          // y frac up
-
-     movl     %ecx,%ebp
-     shll     $16,%ebp          // fracstep f. up
-     shrl     $16,%ecx          // fracstep i. ->cl
-
-    movl     C(dc_source),%esi
-
-//
-// lets rock :) !
-//
-    movl    C(pixelcount),%eax
-    movb    %al,%dh             // low byte of the count, tested for &1 / &2 below
-    shrl    $2,%eax
-    movb    %al,%ch             // quad count
-    movl    C(dc_colormap),%eax
-    testb   $3,%dh
-    jz      v4skyquadloop
-
-//
-//  do un-even pixel
-//
-    testb   $1,%dh
-    jz      2f
-
-    movb    (%esi,%ebx),%al     // prep un-even loops
-     addl    %ebp,%edx            // ypos f += ystep f
-    adcb    %cl,%bl              // ypos i += ystep i
-     movb    (%eax),%dl           // colormap texel
-     movb    %dl,(%edi)           // output pixel
-    addl    C(vidwidth),%edi
-
-//
-//  do two non-quad-aligned pixels
-//
-2:
-    testb   $2,%dh
-    jz      3f
-
-    movb    (%esi,%ebx),%al      // fetch source texel
-     addl    %ebp,%edx            // ypos f += ystep f
-    adcb    %cl,%bl              // ypos i += ystep i
-     movb    (%eax),%dl           // colormap texel
-     movb    %dl,(%edi)           // output pixel
-
-    movb    (%esi,%ebx),%al      // fetch source texel
-     addl    %ebp,%edx            // ypos f += ystep f
-    adcb    %cl,%bl              // ypos i += ystep i
-     movb    (%eax),%dl           // colormap texel
-    addl    C(vidwidth),%edi
-     movb    %dl,(%edi)           // output pixel
-
-    addl    C(vidwidth),%edi
-
-//
-//  test if there was at least 4 pixels
-//
-3:
-    testb   $0xFF,%ch           // test quad count
-    jz      vskydone
-
-//
-// ebp : ystep frac. upper 24 bits
-// edx : y     frac. upper 24 bits
-// ebx : y     i.    low 8 bits in bl, wraps at 256 (no mask in the loop)
-// ecx : ch = counter, cl = y step i.
-// eax : colormap aligned 256
-// esi : source texture column
-// edi : dest screen
-//
-v4skyquadloop:
-//    .align  4
-vskyquadloop:
-    movb    (%esi,%ebx),%al     // prep loop
-     addl    %ebp,%edx            // ypos f += ystep f
-    adcb    %cl,%bl              // ypos i += ystep i
-     movb    (%eax),%dl           // colormap texel
-    movb    %dl,(%edi)           // output pixel
-
-    // p1b..p4b: the 0x12345678 operands are overwritten with multiples of
-    // vidwidth by ASM_PatchRowBytes
-    movb    (%esi,%ebx),%al      // fetch source texel
-     addl    %ebp,%edx
-    adcb    %cl,%bl
-     movb    (%eax),%dl
-p1b:    movb    %dl,0x12345678(%edi)
-
-    movb    (%esi,%ebx),%al      // fetch source texel
-     addl    %ebp,%edx
-    adcb    %cl,%bl
-     movb    (%eax),%dl
-p2b:    movb    %dl,2*0x12345678(%edi)
-
-    movb    (%esi,%ebx),%al      // fetch source texel
-     addl    %ebp,%edx
-    adcb    %cl,%bl
-     movb    (%eax),%dl
-p3b:    movb    %dl,3*0x12345678(%edi)
-
-p4b:    addl    $4*0x12345678,%edi
-
-    decb   %ch
-     jnz    vskyquadloop
-
-vskydone:
-    popl    %ebx                // restore register variables
-    popl    %edi
-    popl    %esi
-    popl    %ebp                // restore caller's stack frame pointer
-    ret
-
-
-
-//----------------------------------------------------------------------
-//
-// R_DrawSpan
-//
-// Horizontal texture mapping
-//
-//----------------------------------------------------------------------
-
-// Data words declared for the span drawer.
-// NOTE(review): none of ystep, xstep or texwidth is referenced by the code
-// visible in this part of the file - confirm whether they are still used.
-    .data
-
-ystep:          .long   0
-xstep:          .long   0
-C(texwidth):    .long   64      // texture width
-#if !defined( LINUX)
-    .text
-#endif
-#ifdef LINUX
-    .align 2
-#else
-    .align 4
-#endif
-// R_DrawSpan_8
-//
-//   Draws the pixels ds_x1..ds_x2 of screen row ds_y, reading ds_xfrac,
-//   ds_yfrac, ds_xstep, ds_ystep, ds_source and ds_colormap.
-//   Position and step are each packed into one 32-bit word: the upper 16
-//   bits come from the x value shifted left by 10, the lower 16 bits from
-//   the y value shifted right by 6.  For each pixel the sequence
-//   "shrw $10 / roll $6 / andl $4095" turns that word into a 12-bit texel
-//   offset (6 bits taken from the y half, 6 bits from the x half).
-//   The routine patches its own inner loop on every call: the packed step
-//   is stored at hpatch1+2 / hpatch2+2 and the source pointer at
-//   hpatch3+2 / hpatch4+2, so the code must be writable.
-//   Pixels are written two at a time with a 16-bit store; an odd final
-//   pixel is written separately at hchecklast.
-.globl C(R_DrawSpan_8)
-C(R_DrawSpan_8):
-    pushl   %ebp                // preserve caller's stack frame pointer
-    pushl   %esi                // preserve register variables
-    pushl   %edi
-    pushl   %ebx
-
-
-//
-// find loop count
-//
-    movl    C(ds_x2),%eax
-    incl    %eax
-    subl    C(ds_x1),%eax               // pixel count
-    movl    %eax,C(pixelcount)          // save for final pixel
-    js      hdone                       // nothing to scale
-    shrl    $1,%eax                     // number of pixel pairs (count / 2)
-    movl    %eax,C(loopcount)
-
-//
-// build composite position
-//
-    movl    C(ds_xfrac),%ebp
-    shll    $10,%ebp
-    andl    $0x0ffff0000,%ebp
-    movl    C(ds_yfrac),%eax
-    shrl    $6,%eax
-    andl    $0x0ffff,%eax
-    movl    C(ds_y),%edi
-    orl     %eax,%ebp
-
-    movl    C(ds_source),%esi
-
-//
-// calculate screen dest
-//
-
-    movl    C(ylookup)(,%edi,4),%edi
-    movl    C(ds_x1),%eax
-    addl    C(columnofs)(,%eax,4),%edi
-
-//
-// build composite step
-//
-    movl    C(ds_xstep),%ebx
-    shll    $10,%ebx
-    andl    $0x0ffff0000,%ebx
-    movl    C(ds_ystep),%eax
-    shrl    $6,%eax
-    andl    $0x0ffff,%eax
-    orl     %eax,%ebx
-
-    //movl        %eax,OFFSET hpatch1+2        // convince tasm to modify code...
-    movl    %ebx,hpatch1+2
-    //movl        %eax,OFFSET hpatch2+2        // convince tasm to modify code...
-    movl    %ebx,hpatch2+2
-    movl    %esi,hpatch3+2
-    movl    %esi,hpatch4+2
-// %eax      aligned colormap
-// %ebx      aligned colormap
-// %ecx,%edx  scratch
-// %esi      virtual source
-// %edi      moving destination pointer
-// %ebp      frac
-    movl    C(ds_colormap),%eax
-//    shld    $22,%ebp,%ecx           // begin calculating third pixel (y units)
-//    shld    $6,%ebp,%ecx            // begin calculating third pixel (x units)
-     movl    %ebp,%ecx
-    addl    %ebx,%ebp               // advance frac pointer
-     shrw    $10,%cx
-     roll    $6,%ecx
-    andl    $4095,%ecx              // finish calculation for third pixel
-//    shld    $22,%ebp,%edx           // begin calculating fourth pixel (y units)
-//    shld    $6,%ebp,%edx            // begin calculating fourth pixel (x units)
-     movl    %ebp,%edx
-     shrw    $10,%dx
-     roll    $6,%edx
-    addl    %ebx,%ebp               // advance frac pointer
-    andl    $4095,%edx              // finish calculation for fourth pixel
-    movl    %eax,%ebx
-    movb    (%esi,%ecx),%al         // get first pixel
-    movb    (%esi,%edx),%bl         // get second pixel
-    // the movb below does not change flags, so the jz further down still
-    // tests this result (fewer than two pixels -> hchecklast)
-    testl   $0x0fffffffe,C(pixelcount)
-    movb    (%eax),%dl             // color translate first pixel
-
-//    jnz hdoubleloop             // at least two pixels to map
-//    jmp hchecklast
-
-//    movw $0xf0f0,%dx //see visplanes start
-
-    jz      hchecklast
-    movb    (%ebx),%dh              // color translate second pixel
-    // esi is free to become the pair counter: the source pointer now lives
-    // in the patched hpatch3/hpatch4 operands
-    movl    C(loopcount),%esi
-//    .align  4
-hdoubleloop:
-//    shld    $22,%ebp,%ecx        // begin calculating third pixel (y units)
-//    shld    $6,%ebp,%ecx         // begin calculating third pixel (x units)
-    movl    %ebp,%ecx
-    shrw    $10,%cx
-    roll    $6,%ecx
-hpatch1:
-    addl    $0x012345678,%ebp    // advance frac pointer
-    movw    %dx,(%edi)           // write first pixel
-    andl    $4095,%ecx           // finish calculation for third pixel
-//    shld    $22,%ebp,%edx        // begin calculating fourth pixel (y units)
-//    shld    $6,%ebp,%edx         // begin calculating fourth pixel (x units)
-    movl    %ebp,%edx
-    shrw    $10,%dx
-    roll    $6,%edx
-hpatch3:
-    movb    0x012345678(%ecx),%al      // get third pixel
-//    movb    %bl,1(%edi)          // write second pixel
-    andl    $4095,%edx           // finish calculation for fourth pixel
-hpatch2:
-    addl    $0x012345678,%ebp    // advance frac pointer
-hpatch4:
-    movb    0x012345678(%edx),%bl      // get fourth pixel
-    movb    (%eax),%dl           // color translate third pixel
-    addl    $2,%edi              // advance to third pixel destination
-    decl    %esi                 // done with loop?
-    movb    (%ebx),%dh           // color translate fourth pixel
-    jnz hdoubleloop
-
-// check for final pixel
-hchecklast:
-    testl   $1,C(pixelcount)
-    jz      hdone
-    movb    %dl,(%edi)           // write final pixel
-
-hdone:
-    popl    %ebx                 // restore register variables
-    popl    %edi
-    popl    %esi
-    popl    %ebp                 // restore caller's stack frame pointer
-    ret
-
-
-//.endif
-
-
-//----------------------------------------------------------------------
-// R_DrawTransColumn
-//
-// Vertical column texture drawer, with transparency. Replaces Doom2's
-// 'fuzz' effect, which was not so beautiful.
-// Transparency is always impressive in some way, don't know why...
-//----------------------------------------------------------------------
-
-#ifdef LINUX
-    .align 2
-#else
-    .align 5
-#endif
-
-// R_DrawTranslucentColumn_8
-//
-//   Column drawer that mixes each texel with the byte already on screen.
-//   Per pixel: ah = texel, al = screen byte, dl = byte at (eax) with eax
-//   based on dc_transmap, then dl = byte at (edx) with edx based on
-//   dc_colormap, and that byte is stored.  The texture row index is masked
-//   with 0x7f after every step.
-//   In the unrolled loop ebp and edi alternate as destination pointers, each
-//   advancing by the patched p6..p9 immediate (2 * vidwidth); the fractional
-//   y step therefore lives in tystep instead of ebp.
-//   NOTE(review): overwriting ah/al and dl only indexes correctly if the low
-//   16 bits of dc_transmap and the low 8 bits of dc_colormap are zero -
-//   presumably guaranteed by how the tables are allocated; confirm.
-.globl C(R_DrawTranslucentColumn_8)
-C(R_DrawTranslucentColumn_8):
-    pushl   %ebp                // preserve caller's stack frame pointer
-    pushl   %esi                // preserve register variables
-    pushl   %edi
-    pushl   %ebx
-
-//
-// dest = ylookup[dc_yl] + columnofs[dc_x];
-//
-    movl     C(dc_yl),%ebp
-    movl     %ebp,%ebx
-    movl     C(ylookup)(,%ebx,4),%edi
-    movl     C(dc_x),%ebx
-    addl     C(columnofs)(,%ebx,4),%edi  // edi = dest
-
-//
-// pixelcount = yh - yl + 1
-//
-    movl     C(dc_yh),%eax
-    incl     %eax
-    subl     %ebp,%eax                   // pixel count
-    movl     %eax,C(pixelcount)          // save for final pixel
-    jle      vtdone                       // nothing to scale
-
-//
-// frac = dc_texturemid - (centery-dc_yl)*fracstep;
-//
-    movl     C(dc_iscale),%ecx           // fracstep
-    movl     C(centery),%eax
-    subl     %ebp,%eax
-    imul     %ecx,%eax
-    movl     C(dc_texturemid),%edx
-    subl     %eax,%edx
-    movl     %edx,%ebx
-
-    shrl     $16,%ebx          // frac int.
-    andl     $0x0000007f,%ebx
-    shll     $16,%edx          // y frac up
-
-    movl     %ecx,%ebp
-    shll     $16,%ebp          // fracstep f. up
-    shrl     $16,%ecx          // fracstep i. ->cl
-    andb     $0x7f,%cl
-    // merge: ecx = y frac (upper 16 bits, from edx) with the old cx kept in
-    // the lower 16 bits; edx is then free to hold the colormap pointer
-    pushw    %cx
-    movl     %edx,%ecx
-    popw     %cx
-    movl     C(dc_colormap),%edx
-    movl     C(dc_source),%esi
-
-//
-// lets rock :) !
-//
-    movl    C(pixelcount),%eax
-    shrl    $2,%eax
-    // the two movs below leave the flags alone; the jz tests this result
-    testb   $0x03,C(pixelcount)
-    movb    %al,%ch             // quad count
-    movl    C(dc_transmap),%eax
-    jz      vt4quadloop
-//
-//  do un-even pixel
-//
-    testb   $1,C(pixelcount)
-    jz      2f
-
-    movb    (%esi,%ebx),%ah      // fetch texel : colormap number
-     addl    %ebp,%ecx
-    adcb    %cl,%bl
-     movb    (%edi),%al           // fetch dest  : index into colormap
-    andb    $0x7f,%bl
-     movb    (%eax),%dl
-    movb    (%edx), %dl          // use colormap now !
-    movb    %dl,(%edi)
-     addl    C(vidwidth),%edi
-//
-//  do two non-quad-aligned pixels
-//
-2:
-    testb   $2,C(pixelcount)
-    jz      3f
-
-    movb    (%esi,%ebx),%ah      // fetch texel : colormap number
-     addl    %ebp,%ecx
-    adcb    %cl,%bl
-     movb    (%edi),%al           // fetch dest  : index into colormap
-    andb    $0x7f,%bl
-     movb    (%eax),%dl
-    movb    (%edx), %dl          // use colormap now !
-    movb    %dl,(%edi)
-     addl    C(vidwidth),%edi
-
-    movb    (%esi,%ebx),%ah      // fetch texel : colormap number
-     addl    %ebp,%ecx
-    adcb    %cl,%bl
-     movb    (%edi),%al           // fetch dest  : index into colormap
-    andb    $0x7f,%bl
-     movb    (%eax),%dl
-    movb    (%edx), %dl          // use colormap now !
-    movb    %dl,(%edi)
-     addl    C(vidwidth),%edi
-
-//
-//  test if there was at least 4 pixels
-//
-3:
-    testb   $0xFF,%ch           // test quad count
-    jz      vtdone
-
-//
-// tystep : ystep frac. upper 24 bits
-// edx : upper 24 bit : colormap
-//  dl : tmp pixel to write
-// ebx : y     i.    lower 7 bits,  masked for index
-// ecx : y     frac. upper 16 bits
-// ecx : ch = counter, cl = y step i.
-// eax : transmap aligned 65535 (upper 16 bit)
-//  ah : foreground pixel (from the texture)
-//  al : background pixel (from the screen buffer)
-// esi : source texture column
-// ebp,edi : dest screen
-//
-vt4quadloop:
-    movb    (%esi,%ebx),%ah      // fetch texel : colormap number
-// NOTE(review): p5 is patched with 1 * vidwidth, so this reads the screen
-// byte one row below edi, while the first store at inloop goes to the row
-// edi points at here (ebp = edi).  Confirm this is intended.
-p5: movb    0x12345678(%edi),%al           // fetch dest  : index into colormap
-
-    movl    %ebp,C(tystep)
-    movl    %edi,%ebp
-    // back edi up one row; the patched add at p7 then leaves it one row
-    // ahead of ebp
-    subl    C(vidwidth),%edi
-    jmp inloop
-//    .align  4
-vtquadloop:
-    addl    C(tystep),%ecx
-    adcb    %cl,%bl
-p6: addl    $2*0x12345678,%ebp
-    andb    $0x7f,%bl
-    movb    (%eax),%dl
-    movb    (%esi,%ebx),%ah      // fetch texel : colormap number
-    movb    (%edx), %dl          // use colormap now !
-    movb    %dl,(%edi)
-    movb    (%ebp),%al           // fetch dest  : index into colormap
-inloop:
-    addl    C(tystep),%ecx
-    adcb    %cl,%bl
-p7: addl    $2*0x12345678,%edi
-    andb    $0x7f,%bl
-    movb    (%eax),%dl
-    movb    (%esi,%ebx),%ah      // fetch texel : colormap number
-    movb    (%edx), %dl          // use colormap now !
-    movb    %dl,(%ebp)
-    movb    (%edi),%al           // fetch dest  : index into colormap
-
-    addl    C(tystep),%ecx
-    adcb    %cl,%bl
-p8: addl    $2*0x12345678,%ebp
-    andb    $0x7f,%bl
-    movb    (%eax),%dl
-    movb    (%esi,%ebx),%ah      // fetch texel : colormap number
-    movb    (%edx), %dl          // use colormap now !
-    movb    %dl,(%edi)
-    movb    (%ebp),%al           // fetch dest  : index into colormap
-
-    addl    C(tystep),%ecx
-    adcb    %cl,%bl
-p9: addl    $2*0x12345678,%edi
-    andb    $0x7f,%bl
-    movb    (%eax),%dl
-    movb    (%esi,%ebx),%ah      // fetch texel : colormap number
-    movb    (%edx), %dl          // use colormap now !
-    movb    %dl,(%ebp)
-    movb    (%edi),%al           // fetch dest  : index into colormap
-
-    decb   %ch
-     jnz    vtquadloop
-
-vtdone:
-    popl    %ebx                // restore register variables
-    popl    %edi
-    popl    %esi
-    popl    %ebp                // restore caller's stack frame pointer
-    ret
-
-#endif // ifdef USEASM
-
-
-
-//----------------------------------------------------------------------
-// R_DrawShadeColumn
-//
-//   for smoke..etc.. test.
-//----------------------------------------------------------------------
-
-#ifdef LINUX
-    .align 2
-#else
-    .align 5
-#endif
-// R_DrawShadeColumn_8
-//
-//   Same column walk as R_DrawColumn_8, but each output byte is read from
-//   the table at C(colormaps), indexed with the texel in ah and the byte
-//   already on screen in al.
-//   In the unrolled loop ebp and edi alternate as destination pointers, each
-//   advancing by the patched sh6..sh9 immediate (2 * vidwidth); the
-//   fractional y step lives in tystep.
-//   NOTE(review): this routine follows the "#endif // ifdef USEASM" line,
-//   while ASM_PatchRowBytes, which patches sh5..sh9, is inside that #ifdef -
-//   confirm the two are always built together.
-//   NOTE(review): overwriting ah/al only indexes correctly if the low 16
-//   bits of the colormaps pointer are zero - confirm.
-.globl C(R_DrawShadeColumn_8)
-C(R_DrawShadeColumn_8):
-    pushl   %ebp                // preserve caller's stack frame pointer
-    pushl   %esi                // preserve register variables
-    pushl   %edi
-    pushl   %ebx
-
-//
-// dest = ylookup[dc_yl] + columnofs[dc_x];
-//
-    movl     C(dc_yl),%ebp
-    movl     %ebp,%ebx
-    movl     C(ylookup)(,%ebx,4),%edi
-    movl     C(dc_x),%ebx
-    addl     C(columnofs)(,%ebx,4),%edi  // edi = dest
-
-//
-// pixelcount = yh - yl + 1
-//
-    movl     C(dc_yh),%eax
-    incl     %eax
-    subl     %ebp,%eax                   // pixel count
-    movl     %eax,C(pixelcount)          // save for final pixel
-    jle      shdone                       // nothing to scale
-
-//
-// frac = dc_texturemid - (centery-dc_yl)*fracstep;
-//
-    movl     C(dc_iscale),%ecx           // fracstep
-    movl     C(centery),%eax
-    subl     %ebp,%eax
-    imul     %ecx,%eax
-    movl     C(dc_texturemid),%edx
-    subl     %eax,%edx
-     movl     %edx,%ebx
-     shrl     $16,%ebx          // frac int.
-     andl     $0x0000007f,%ebx
-     shll     $16,%edx          // y frac up
-
-     movl     %ecx,%ebp
-     shll     $16,%ebp          // fracstep f. up
-     shrl     $16,%ecx          // fracstep i. ->cl
-     andb     $0x7f,%cl
-
-    movl     C(dc_source),%esi
-
-//
-// lets rock :) !
-//
-    movl    C(pixelcount),%eax
-    movb    %al,%dh             // low byte of the count, tested for &1 / &2 below
-    shrl    $2,%eax
-    movb    %al,%ch             // quad count
-    movl    C(colormaps),%eax
-    testb   $0x03,%dh
-    jz      sh4quadloop
-
-//
-//  do un-even pixel
-//
-    testb   $1,%dh
-    jz      2f
-
-    movb    (%esi,%ebx),%ah      // fetch texel : colormap number
-     addl    %ebp,%edx
-    adcb    %cl,%bl
-     movb    (%edi),%al           // fetch dest  : index into colormap
-    andb    $0x7f,%bl
-     movb    (%eax),%dl
-    movb    %dl,(%edi)
-     addl    C(vidwidth),%edi
-
-//
-//  do two non-quad-aligned pixels
-//
-2:
-    testb   $2,%dh
-    jz      3f
-
-    movb    (%esi,%ebx),%ah      // fetch texel : colormap number
-     addl    %ebp,%edx
-    adcb    %cl,%bl
-     movb    (%edi),%al           // fetch dest  : index into colormap
-    andb    $0x7f,%bl
-     movb    (%eax),%dl
-    movb    %dl,(%edi)
-     addl    C(vidwidth),%edi
-
-    movb    (%esi,%ebx),%ah      // fetch texel : colormap number
-     addl    %ebp,%edx
-    adcb    %cl,%bl
-     movb    (%edi),%al           // fetch dest  : index into colormap
-    andb    $0x7f,%bl
-     movb    (%eax),%dl
-    movb    %dl,(%edi)
-     addl    C(vidwidth),%edi
-
-//
-//  test if there was at least 4 pixels
-//
-3:
-    testb   $0xFF,%ch           // test quad count
-    jz      shdone
-
-//
-// tystep : ystep frac. (ebp is reused as a second dest pointer below)
-// edx : y     frac. in the upper bits, dh = 0x7f index mask, dl = tmp pixel
-// ebx : y     i.    lower 7 bits,  masked for index
-// ecx : ch = counter, cl = y step i.
-// eax : colormaps base, ah = texel, al = byte read from the screen
-// esi : source texture column
-// ebp,edi : dest screen (alternating)
-//
-sh4quadloop:
-    movb    $0x7f,%dh           // prep mask
-
-    movb    (%esi,%ebx),%ah      // fetch texel : colormap number
-// NOTE(review): sh5 is patched with 1 * vidwidth, so this reads the screen
-// byte one row below edi, while the first store at shinloop goes to the row
-// edi points at here (ebp = edi).  Confirm this is intended.
-sh5:    movb    0x12345678(%edi),%al           // fetch dest  : index into colormap
-
-    movl    %ebp,C(tystep)
-    movl    %edi,%ebp
-    // back edi up one row; the patched add at sh7 then leaves it one row
-    // ahead of ebp
-    subl    C(vidwidth),%edi
-    jmp shinloop
-//    .align  4
-shquadloop:
-    addl    C(tystep),%edx
-    adcb    %cl,%bl
-    andb    %dh,%bl
-sh6:    addl    $2*0x12345678,%ebp
-    movb    (%eax),%dl
-    movb    (%esi,%ebx),%ah      // fetch texel : colormap number
-    movb    %dl,(%edi)
-    movb    (%ebp),%al           // fetch dest  : index into colormap
-shinloop:
-    addl    C(tystep),%edx
-    adcb    %cl,%bl
-    andb    %dh,%bl
-sh7:    addl    $2*0x12345678,%edi
-    movb    (%eax),%dl
-    movb    (%esi,%ebx),%ah      // fetch texel : colormap number
-    movb    %dl,(%ebp)
-    movb    (%edi),%al           // fetch dest  : index into colormap
-
-    addl    C(tystep),%edx
-    adcb    %cl,%bl
-    andb    %dh,%bl
-sh8:    addl    $2*0x12345678,%ebp
-    movb    (%eax),%dl
-    movb    (%esi,%ebx),%ah      // fetch texel : colormap number
-    movb    %dl,(%edi)
-    movb    (%ebp),%al           // fetch dest  : index into colormap
-
-    addl    C(tystep),%edx
-    adcb    %cl,%bl
-    andb    %dh,%bl
-sh9:    addl    $2*0x12345678,%edi
-    movb    (%eax),%dl
-    movb    (%esi,%ebx),%ah      // fetch texel : colormap number
-    movb    %dl,(%ebp)
-    movb    (%edi),%al           // fetch dest  : index into colormap
-
-    decb   %ch
-     jnz    shquadloop
-
-shdone:
-    popl    %ebx                // restore register variables
-    popl    %edi
-    popl    %esi
-    popl    %ebp                // restore caller's stack frame pointer
-    ret
-
-
-
-//----------------------------------------------------------------------
-//
-//  R_DrawWaterColumn : basically it's just a copy of R_DrawColumn,
-//                      but it uses dc_colormap from dc_yl to dc_yw-1
-//                      then it uses dc_wcolormap from dc_yw to dc_yh
-//
-//  Thus, the 'underwater' part of the walls is remapped to 'water-like'
-//  colors.
-//
-//----------------------------------------------------------------------
-
-#ifdef LINUX
-    .align 2
-#else
-    .align 5
-#endif
-.globl C(R_DrawWaterColumn)
-C(R_DrawWaterColumn):
-    pushl   %ebp                // preserve caller's stack frame pointer
-    pushl   %esi                // preserve register variables
-    pushl   %edi
-    pushl   %ebx
-
-//
-// dest = ylookup[dc_yl] + columnofs[dc_x];
-//
-    movl     C(dc_yl),%ebp
-    movl     %ebp,%ebx
-    movl     C(ylookup)(,%ebx,4),%edi
-    movl     C(dc_x),%ebx
-    addl     C(columnofs)(,%ebx,4),%edi  // edi = dest
-
-//
-// pixelcount = yh - yl + 1
-//
-    movl     C(dc_yh),%eax
-    incl     %eax
-    subl     %ebp,%eax                   // pixel count
-    movl     %eax,C(pixelcount)          // save for final pixel
-    jle      wdone                       // nothing to scale
-
-//
-// frac = dc_texturemid - (centery-dc_yl)*fracstep;
-//
-    movl     C(dc_iscale),%ecx           // fracstep
-    movl     C(centery),%eax
-    subl     %ebp,%eax
-    imul     %ecx,%eax
-    movl     C(dc_texturemid),%edx
-    subl     %eax,%edx
-     movl     %edx,%ebx
-     shrl     $16,%ebx          // frac int.
-     andl     $0x0000007f,%ebx
-     shll     $16,%edx          // y frac up
-
-     movl     %ecx,%ebp
-     shll     $16,%ebp          // fracstep f. up
-     shrl     $16,%ecx          // fracstep i. ->cl
-     andb     $0x7f,%cl
-
-    movl     C(dc_source),%esi
-
-//
-// lets rock :) !
-//
-    movl    C(pixelcount),%eax
-    movb    %al,%dh
-    shrl    $2,%eax
-    movb    %al,%ch             // quad count
-    movl    C(dc_wcolormap),%eax
-    testb   $3,%dh
-    jz      w4quadloop
-
-//
-//  do un-even pixel
-//
-    testb   $1,%dh
-    jz      2f
-
-    movb    (%esi,%ebx),%al     // prep un-even loops
-     addl    %ebp,%edx            // ypos f += ystep f
-    adcb    %cl,%bl              // ypos i += ystep i
-     movb    (%eax),%dl           // colormap texel
-    andb    $0x7f,%bl            // mask 0-127 texture index
-     movb    %dl,(%edi)           // output pixel
-    addl    C(vidwidth),%edi
-
-//
-//  do two non-quad-aligned pixels
-//
-2:
-    testb   $2,%dh
-    jz      3f
-
-    movb    (%esi,%ebx),%al      // fetch source texel
-     addl    %ebp,%edx            // ypos f += ystep f
-    adcb    %cl,%bl              // ypos i += ystep i
-     movb    (%eax),%dl           // colormap texel
-    andb    $0x7f,%bl            // mask 0-127 texture index
-     movb    %dl,(%edi)           // output pixel
-
-    movb    (%esi,%ebx),%al      // fetch source texel
-     addl    %ebp,%edx            // ypos f += ystep f
-    adcb    %cl,%bl              // ypos i += ystep i
-     movb    (%eax),%dl           // colormap texel
-    andb    $0x7f,%bl            // mask 0-127 texture index
-    addl    C(vidwidth),%edi
-     movb    %dl,(%edi)           // output pixel
-
-    addl    C(vidwidth),%edi
-
-//
-//  test if there was at least 4 pixels
-//
-3:
-    testb   $0xFF,%ch           // test quad count
-    jz      wdone
-
-//
-// ebp : ystep frac. upper 24 bits
-// edx : y     frac. upper 24 bits
-// ebx : y     i.    lower 7 bits,  masked for index
-// ecx : ch = counter, cl = y step i.
-// eax : colormap aligned 256
-// esi : source texture column
-// edi : dest screen
-//
-w4quadloop:
-    movb    $0x7f,%dh           // prep mask
-//    .align  4
-wquadloop:
-    movb    (%esi,%ebx),%al     // prep loop
-     addl    %ebp,%edx            // ypos f += ystep f
-    adcb    %cl,%bl              // ypos i += ystep i
-     movb    (%eax),%dl           // colormap texel
-    movb    %dl,(%edi)           // output pixel
-     andb    $0x7f,%bl            // mask 0-127 texture index
-
-    movb    (%esi,%ebx),%al      // fetch source texel
-     addl    %ebp,%edx
-    adcb    %cl,%bl
-     movb    (%eax),%dl
-w1:    movb    %dl,0x12345678(%edi)
-     andb    $0x7f,%bl
-
-    movb    (%esi,%ebx),%al      // fetch source texel
-     addl    %ebp,%edx
-    adcb    %cl,%bl
-     movb    (%eax),%dl
-w2:    movb    %dl,2*0x12345678(%edi)
-     andb    $0x7f,%bl
-
-    movb    (%esi,%ebx),%al      // fetch source texel
-     addl    %ebp,%edx
-    adcb    %cl,%bl
-     movb    (%eax),%dl
-w3:    movb    %dl,3*0x12345678(%edi)
-     andb    $0x7f,%bl
-
-w4:    addl    $4*0x12345678,%edi
-
-    decb   %ch
-     jnz    wquadloop
-
-wdone:
-    popl    %ebx                // restore register variables
-    popl    %edi
-    popl    %esi
-    popl    %ebp                // restore caller's stack frame pointer
-    ret
-
-
-
-
-
-
-
-//----------------------------------------------------------------------
-//
-//  R_DrawSpanNoWrap
-//
-//      Horizontal texture mapping, does not remap colors,
-//      neither needs to wrap around the source texture.
-//
-//      Thus, a special optimisation can be used...
-//
-//----------------------------------------------------------------------
-
-    .data
-
-advancetable:   .long   0, 0
-#if !defined( LINUX)
-    .text
-#endif
-#ifdef LINUX
-    .align 2
-#else
-    .align 4
-#endif
-.globl C(R_DrawSpanNoWrap)
-C(R_DrawSpanNoWrap):
-    pushl   %ebp                // preserve caller's stack frame pointer
-    pushl   %esi                // preserve register variables
-    pushl   %edi
-    pushl   %ebx
-
-//
-// find loop count
-//
-
-    movl    C(ds_x2),%eax
-    incl    %eax
-    subl    C(ds_x1),%eax               // pixel count
-    movl    %eax,C(pixelcount)          // save for final pixel
-    jle     htvdone                       // nothing to scale
-//    shrl    $1,%eax                     // double pixel count
-//    movl    %eax,C(loopcount)
-
-//
-// calculate screen dest
-//
-
-    movl    C(ds_y),%edi        //full destination start address
-
-//
-// set up advancetable
-//
-
-    movl    C(ds_xstep),%ebp
-    movl    C(ds_ystep),%ecx
-    movl    %ecx,%eax
-    movl    %ebp,%edx
-    sarl    $16,%edx            // xstep >>= 16;
-    movl    C(vidwidth),%ebx
-    sarl    $16,%eax            // ystep >>= 16;
-    jz      0f
-    imull   %ebx,%eax           // (ystep >> 16) * texwidth;
-0:
-    addl    %edx,%eax           // add in xstep
-                                // (ystep >> 16) * texwidth + (xstep >> 16);
-
-    movl    %eax,advancetable+4 // advance base in y
-    addl    %ebx,%eax           // ((ystep >> 16) + 1) * texwidth +
-                                //  (xstep >> 16);
-    movl    %eax,advancetable   // advance extra in y
-
-    shll    $16,%ebp            // left-justify xstep fractional part
-    movl    %ebp,xstep
-    shll    $16,%ecx            // left-justify ystep fractional part
-    movl    %ecx,ystep
-
-//
-// calculate the texture starting address
-//
-    movl    C(ds_source),%esi       // texture source
-
-     movl    C(ds_yfrac),%eax
-     movl    %eax,%edx
-     sarl    $16,%eax
-    movl    C(ds_xfrac),%ecx
-     imull   %ebx,%eax               // (yfrac >> 16) * texwidth
-    movl    %ecx,%ebx
-    sarl    $16,%ecx
-    movl    %ecx,%ebp
-     addl    %eax,%ebp               // source = (xfrac >> 16) +
-                                    //           ((yfrac >> 16) * texwidth);
-
-//
-//  esi : texture source
-//  edi : screen dest
-//  eax : colormap aligned on 256 boundary, hehehe...
-//  ebx : xfrac << 16
-//  ecx : used in loop, contains either 0 or -1, *4, offset into advancetable
-//  edx : yfrac << 16
-//  ebp : offset into texture
-//
-
-    shll    $16,%edx             // yfrac upper word, lower byte will be used
-    movl    C(ds_colormap),%eax
-    shll    $16,%ebx             // xfrac upper word, lower unused
-
-    movl    C(pixelcount),%ecx
-    shrl    $2,%ecx
-    movb    %cl,%dh             // quad pixels count
-
-    movl    C(pixelcount),%ecx
-    andl    $3,%ecx
-    jz      htvquadloop         // pixelcount is multiple of 4
-    decl    %ecx
-    jz      1f
-    decl    %ecx
-    jz      2f
-
-//
-//  do one to three pixels first
-//
-    addl    ystep,%edx          // yfrac += ystep
-   sbbl    %ecx,%ecx           // turn carry into 0 or -1 if set
-    movb    (%esi,%ebp),%al          // get texture pixel
-   addl    xstep,%ebx           // xfrac += xstep
-//    movb    (%eax),%dl           // pixel goes through colormap
-   adcl    advancetable+4(,%ecx,4),%ebp       // advance source
-    movb    %al,(%edi)           // write pixel dest
-
-   incl    %edi
-
-2:
-    addl    ystep,%edx          // yfrac += ystep
-   sbbl    %ecx,%ecx           // turn carry into 0 or -1 if set
-    movb    (%esi,%ebp),%al          // get texture pixel
-   addl    xstep,%ebx           // xfrac += xstep
-//    movb    (%eax),%dl           // pixel goes through colormap
-   adcl    advancetable+4(,%ecx,4),%ebp       // advance source
-    movb    %al,(%edi)           // write pixel dest
-
-   incl    %edi
-
-1:
-    addl    ystep,%edx          // yfrac += ystep
-   sbbl    %ecx,%ecx           // turn carry into 0 or -1 if set
-    movb    (%esi,%ebp),%al          // get texture pixel
-   addl    xstep,%ebx           // xfrac += xstep
-//    movb    (%eax),%dl           // pixel goes through colormap
-   adcl    advancetable+4(,%ecx,4),%ebp       // advance source
-    movb    %al,(%edi)           // write pixel dest
-
-   incl    %edi
-
-//
-//  test if there was at least 4 pixels
-//
-    testb   $0xFF,%dh
-    jz      htvdone
-
-//
-//  two pixels per loop
-// U
-//  V
-htvquadloop:
-    addl    ystep,%edx             // yfrac += ystep
-   sbbl    %ecx,%ecx               // turn carry into 0 or -1 if set
-    movb    (%esi,%ebp),%al        // get texture pixel
-   addl    xstep,%ebx              // xfrac += xstep
-//    movb    (%eax),%dl             // pixel goes through colormap
-   adcl    advancetable+4(,%ecx,4),%ebp       // advance source
-    movb    %al,(%edi)             // write pixel dest
-
-    addl    ystep,%edx
-   sbbl    %ecx,%ecx
-    movb    (%esi,%ebp),%al
-   addl    xstep,%ebx
-//    movb    (%eax),%dl
-   adcl    advancetable+4(,%ecx,4),%ebp
-    movb    %al,1(%edi)
-
-    addl    ystep,%edx
-   sbbl    %ecx,%ecx
-    movb    (%esi,%ebp),%al
-   addl    xstep,%ebx
-//    movb    (%eax),%dl
-   adcl    advancetable+4(,%ecx,4),%ebp
-    movb    %al,2(%edi)
-
-    addl    ystep,%edx
-   sbbl    %ecx,%ecx
-    movb    (%esi,%ebp),%al
-   addl    xstep,%ebx
-//    movb    (%eax),%dl
-   adcl    advancetable+4(,%ecx,4),%ebp
-    movb    %al,3(%edi)
-
-   addl    $4, %edi
-    incl    %ecx    //dummy
-
-   decb   %dh
-    jnz    htvquadloop          // paire dans V-pipe
-
-htvdone:
-    popl    %ebx                // restore register variables
-    popl    %edi
-    popl    %esi
-    popl    %ebp                // restore caller's stack frame pointer
-    ret
-
-
-//.endif
-
-#ifdef HORIZONTALDRAW
-// void R_RotateBuffere (void)
-
-#ifdef LINUX
-    .align 2
-#else
-    .align 4
-#endif
-.globl C(R_RotateBufferasm)
-C(R_RotateBufferasm):
-    pushl   %ebp                // preserve caller's stack frame pointer
-    pushl   %esi                // preserve register variables
-    pushl   %edi
-    pushl   %ebx
-
-    movl    C(dc_source),%esi
-    movl    C(dc_colormap),%edi
-
-    movl    $200,%edx
-ra2:
-    movl    $40,%ecx
-ra:
-    movb    -2*200(%esi),%al
-    movb    -6*200(%esi),%bl
-    movb    -3*200(%esi),%ah
-    movb    -7*200(%esi),%bh
-    shll    $16,%eax
-    shll    $16,%ebx
-    movb    (%esi),%al
-    movb    -4*200(%esi),%bl
-    movb    -1*200(%esi),%ah
-    movb    -5*200(%esi),%bh
-    movl    %eax,(%edi)
-    subl    $8*200,%esi
-    movl    %ebx,4(%edi)
-    addl    $8,%edi
-    decl    %ecx
-    jnz     ra
-
-    addl    $320*200+1,%esi      //32*480 passe a la ligne suivante
-//    addl    320-32,%edi
-
-    decl    %edx
-    jnz     ra2
-
-    pop   %ebp                // preserve caller's stack frame pointer
-    pop   %esi                // preserve register variables
-    pop   %edi
-    pop   %ebx
-    ret
-#endif
--- a/src/tmap_asm.s
+++ b/src/tmap_asm.s
-// SONIC ROBO BLAST 2
-//-----------------------------------------------------------------------------
-// Copyright (C) 1998-2000 by DooM Legacy Team.
-// Copyright (C) 1999-2021 by Sonic Team Junior.
-//
-// This program is free software distributed under the
-// terms of the GNU General Public License, version 2.
-// See the 'LICENSE' file for more details.
-//-----------------------------------------------------------------------------
-/// \file  tmap_asm.s
-/// \brief ???
-
-//.comm _dc_colormap,4
-//.comm _dc_x,4
-//.comm _dc_yl,4
-//.comm _dc_yh,4
-//.comm _dc_iscale,4
-//.comm _dc_texturemid,4
-//.comm _dc_source,4
-//.comm _ylookup,4
-//.comm _columnofs,4
-//.comm _loopcount,4
-//.comm _pixelcount,4
-.data
-_pixelcount:
-.long 0x00000000
-_loopcount:
-.long 0x00000000
-.align 8
-_mmxcomm:
-.long 0x00000000
-.text
-
-        .align 4
-.globl _R_DrawColumn8_NOMMX
-_R_DrawColumn8_NOMMX:
-   pushl %ebp
-   pushl %esi
-   pushl %edi
-   pushl %ebx
-	movl _dc_yl,%edx
-	movl _dc_yh,%eax
-	subl %edx,%eax
-	leal 1(%eax),%ebx
-	testl %ebx,%ebx
-	jle rdc8ndone
-	movl _dc_x,%eax
-        movl _ylookup, %edi
-	movl (%edi,%edx,4),%esi
-	movl _columnofs, %edi
-	addl (%edi,%eax,4),%esi
-	movl _dc_iscale,%edi
-	movl %edx,%eax
-	imull %edi,%eax
-	movl _dc_texturemid,%ecx
-	addl %eax,%ecx
-
-	movl _dc_source,%ebp
-   xorl %edx, %edx
-   subl $0x12345678, %esi
-.globl rdc8nwidth1
-rdc8nwidth1:
-	.align 4,0x90
-rdc8nloop:
-	movl %ecx,%eax
-	shrl $16,%eax
-	addl %edi,%ecx
-	andl $127,%eax
-	addl $0x12345678,%esi
-.globl rdc8nwidth2
-rdc8nwidth2:
-	movb (%eax,%ebp),%dl
-	movl _dc_colormap,%eax
-	movb (%eax,%edx),%al
-	movb %al,(%esi)
-	decl %ebx
-	jne rdc8nloop
-rdc8ndone:
-   popl %ebx
-   popl %edi
-   popl %esi
-   popl %ebp
-   ret
-
-//
-// Optimised specifically for P54C/P55C (aka Pentium with/without MMX)
-// By ES 1998/08/01
-//
-
-.globl _R_DrawColumn_8_Pentium
-_R_DrawColumn_8_Pentium:
-	pushl %ebp
-        pushl %ebx
-	pushl %esi
-        pushl %edi
-	movl _dc_yl,%eax        // Top pixel
-	movl _dc_yh,%ebx        // Bottom pixel
-        movl _ylookup, %edi
-	movl (%edi,%ebx,4),%ecx
-	subl %eax,%ebx          // ebx=number of pixels-1
-	jl rdc8pdone            // no pixel to draw, done
-	jnz rdc8pmany
-	movl _dc_x,%edx         // Special case: only one pixel
-        movl _columnofs, %edi
-	addl (%edi,%edx,4),%ecx // dest pixel at (%ecx)
-	movl _dc_iscale,%esi
-	imull %esi,%eax
-	movl _dc_texturemid,%edi
-	addl %eax,%edi          // texture index in edi
-	movl _dc_colormap,%edx
-   	shrl $16, %edi
-   	movl _dc_source,%ebp
-	andl $127,%edi
-	movb (%edi,%ebp),%dl    // read texture pixel
-	movb (%edx),%al	        // lookup for light
-	movb %al,0(%ecx) 	// write it
-	jmp rdc8pdone		// done!
-.align 4, 0x90
-rdc8pmany:			// draw >1 pixel
-	movl _dc_x,%edx
-        movl _columnofs, %edi
-	movl (%edi,%edx,4),%edx
-	leal 0x12345678(%edx, %ecx), %edi  // edi = two pixels above bottom
-.globl rdc8pwidth5
-rdc8pwidth5:  // DeadBeef = -2*SCREENWIDTH
-        movl _dc_iscale,%edx	// edx = fracstep
-	imull %edx,%eax
-   	shll $9, %edx           // fixme: Should get 7.25 fix as input
-	movl _dc_texturemid,%ecx
-	addl %eax,%ecx          // ecx = frac
-	movl _dc_colormap,%eax  // eax = lighting/special effects LUT
-   	shll $9, %ecx
-   	movl _dc_source,%esi    // esi = source ptr
-
-	imull $0x12345678, %ebx // ebx = negative offset to pixel
-.globl rdc8pwidth6
-rdc8pwidth6:  // DeadBeef = -SCREENWIDTH
-
-// Begin the calculation of the two first pixels
-        leal (%ecx, %edx), %ebp
-	shrl $25, %ecx
-	movb (%esi, %ecx), %al
-	leal (%edx, %ebp), %ecx
-	shrl $25, %ebp
-        movb (%eax), %dl
-
-// The main loop
-rdc8ploop:
-	movb (%esi,%ebp), %al		// load 1
-        leal (%ecx, %edx), %ebp         // calc frac 3
-
-	shrl $25, %ecx                  // shift frac 2
-        movb %dl, 0x12345678(%edi, %ebx)// store 0
-.globl rdc8pwidth1
-rdc8pwidth1:  // DeadBeef = 2*SCREENWIDTH
-
-        movb (%eax), %al                // lookup 1
-
-        movb %al, 0x12345678(%edi, %ebx)// store 1
-.globl rdc8pwidth2
-rdc8pwidth2:  // DeadBeef = 3*SCREENWIDTH
-        movb (%esi, %ecx), %al          // load 2
-
-        leal (%ebp, %edx), %ecx         // calc frac 4
-
-        shrl $25, %ebp                  // shift frac 3
-        movb (%eax), %dl                // lookup 2
-
-        addl $0x12345678, %ebx          // counter
-.globl rdc8pwidth3
-rdc8pwidth3:  // DeadBeef = 2*SCREENWIDTH
-        jl rdc8ploop                    // loop
-
-// End of loop. Write extra pixel or just exit.
-        jnz rdc8pdone
-        movb %dl, 0x12345678(%edi, %ebx)// Write odd pixel
-.globl rdc8pwidth4
-rdc8pwidth4:  // DeadBeef = 2*SCREENWIDTH
-
-rdc8pdone:
-
-        popl %edi
-	popl %esi
-        popl %ebx
-	popl %ebp
-        ret
-
-//
-// MMX asm version, optimised for K6
-// By ES 1998/07/05
-//
-
-.globl _R_DrawColumn_8_K6_MMX
-_R_DrawColumn_8_K6_MMX:
-	pushl %ebp
-        pushl %ebx
-	pushl %esi
-        pushl %edi
-
-        movl %esp, %eax // Push 8 or 12, so that (%esp) gets aligned by 8
-        andl $7,%eax
-        addl $8,%eax
-        movl %eax, _mmxcomm // Temp storage in mmxcomm: (%esp) is used instead
-        subl %eax,%esp
-
-	movl _dc_yl,%edx        // Top pixel
-	movl _dc_yh,%ebx        // Bottom pixel
-        movl _ylookup, %edi
-	movl (%edi,%ebx,4),%ecx
-	subl %edx,%ebx         // ebx=number of pixels-1
-	jl 0x12345678            // no pixel to draw, done
-.globl rdc8moffs1
-rdc8moffs1:
-	jnz rdc8mmany
-	movl _dc_x,%eax         // Special case: only one pixel
-        movl _columnofs, %edi
-	addl (%edi,%eax,4),%ecx  // dest pixel at (%ecx)
-	movl _dc_iscale,%esi
-	imull %esi,%edx
-	movl _dc_texturemid,%edi
-	addl %edx,%edi         // texture index in edi
-	movl _dc_colormap,%edx
-   	shrl $16, %edi
-   	movl _dc_source,%ebp
-	andl $127,%edi
-	movb (%edi,%ebp),%dl  // read texture pixel
-	movb (%edx),%al	 // lookup for light
-	movb %al,0(%ecx) 	 // write it
-	jmp rdc8mdone		 // done!
-.globl rdc8moffs2
-rdc8moffs2:
-.align 4, 0x90
-rdc8mmany:			 // draw >1 pixel
-	movl _dc_x,%eax
-        movl _columnofs, %edi
-	movl (%edi,%eax,4),%eax
-	leal 0x12345678(%eax, %ecx), %esi  // esi = two pixels above bottom
-.globl rdc8mwidth3
-rdc8mwidth3:  // DeadBeef = -2*SCREENWIDTH
-        movl _dc_iscale,%ecx	 // ecx = fracstep
-	imull %ecx,%edx
-   	shll $9, %ecx           // fixme: Should get 7.25 fix as input
-	movl _dc_texturemid,%eax
-	addl %edx,%eax         // eax = frac
-	movl _dc_colormap,%edx  // edx = lighting/special effects LUT
-   	shll $9, %eax
-	leal (%ecx, %ecx), %edi
-   	movl _dc_source,%ebp    // ebp = source ptr
-	movl %edi, 0(%esp)     // Start moving frac and fracstep to MMX regs
-
-	imull $0x12345678, %ebx  // ebx = negative offset to pixel
-.globl rdc8mwidth5
-rdc8mwidth5:  // DeadBeef = -SCREENWIDTH
-
-	movl %edi, 4(%esp)
-	leal (%eax, %ecx), %edi
-	movq 0(%esp), %mm1     // fracstep:fracstep in mm1
-	movl %eax, 0(%esp)
-	shrl $25, %eax
-	movl %edi, 4(%esp)
-	movzbl (%ebp, %eax), %eax
-	movq 0(%esp), %mm0     // frac:frac in mm0
-
-	paddd %mm1, %mm0
-	shrl $25, %edi
-	movq %mm0, %mm2
-	psrld $25, %mm2         // texture index in mm2
-	paddd %mm1, %mm0
-	movq %mm2, 0(%esp)
-
-.globl rdc8mloop
-rdc8mloop:                      		// The main loop
-	movq %mm0, %mm2                    // move 4-5 to temp reg
-	movzbl (%ebp, %edi), %edi 		// read 1
-
-	psrld $25, %mm2 			// shift 4-5
-	movb (%edx,%eax), %cl 		// lookup 0
-
-	movl 0(%esp), %eax 			// load 2
-	addl $0x12345678, %ebx 		// counter
-.globl rdc8mwidth2
-rdc8mwidth2:  // DeadBeef = 2*SCREENWIDTH
-
-	movb %cl, (%esi, %ebx)		// write 0
-	movb (%edx,%edi), %ch 		// lookup 1
-
-	movb %ch, 0x12345678(%esi, %ebx) 	// write 1
-.globl rdc8mwidth1
-rdc8mwidth1:  // DeadBeef = SCREENWIDTH
-	movl 4(%esp), %edi			// load 3
-
-	paddd %mm1, %mm0 			// frac 6-7
-	movzbl (%ebp, %eax), %eax 		// lookup 2
-
-	movq %mm2, 0(%esp) 		     // store texture index 4-5
-	jl rdc8mloop
-
-	jnz rdc8mno_odd
-	movb (%edx,%eax), %cl  // write the last odd pixel
-	movb %cl, 0x12345678(%esi)
-.globl rdc8mwidth4
-rdc8mwidth4:  // DeadBeef = 2*SCREENWIDTH
-rdc8mno_odd:
-
-.globl rdc8mdone
-rdc8mdone:
-        emms
-
-        addl _mmxcomm, %esp
-        popl %edi
-	popl %esi
-        popl %ebx
-	popl %ebp
-        ret
-
-// Need some extra space to align run-time
-.globl R_DrawColumn_8_K6_MMX_end
-R_DrawColumn_8_K6_MMX_end:
-nop;nop;nop;nop;nop;nop;nop;nop;
-nop;nop;nop;nop;nop;nop;nop;nop;
-nop;nop;nop;nop;nop;nop;nop;nop;
-nop;nop;nop;nop;nop;nop;nop;
--- a/src/tmap_mmx.nas
+++ b/src/tmap_mmx.nas
-;; SONIC ROBO BLAST 2
-;;-----------------------------------------------------------------------------
-;; Copyright (C) 1998-2000 by DOSDOOM.
-;; Copyright (C) 2010-2021 by Sonic Team Junior.
-;;
-;; This program is free software distributed under the
-;; terms of the GNU General Public License, version 2.
-;; See the 'LICENSE' file for more details.
-;;-----------------------------------------------------------------------------
-;; FILE:
-;;      tmap_mmx.nas
-;; DESCRIPTION:
-;;      Assembler optimised rendering code for software mode, using SIMD
-;;      instructions.
-;;      Draw wall columns.
-
-
-[BITS 32]
-
-%define FRACBITS 16
-%define TRANSPARENTPIXEL 255
-
-%ifdef LINUX
-%macro cextern 1
-[extern %1]
-%endmacro
-
-%macro cglobal 1
-[global %1]
-%endmacro
-
-%else
-%macro cextern 1
-%define %1 _%1
-[extern %1]
-%endmacro
-
-%macro cglobal 1
-%define %1 _%1
-[global %1]
-%endmacro
-
-%endif
-
-
-; The viddef_s structure. We only need the width field.
-struc viddef_s
-		resb 12
-.width: resb 4
-		resb 44
-endstruc
-
-
-;; externs
-;; columns
-cextern dc_colormap
-cextern dc_x
-cextern dc_yl
-cextern dc_yh
-cextern dc_iscale
-cextern dc_texturemid
-cextern dc_texheight
-cextern dc_source
-cextern dc_hires
-cextern centery
-cextern centeryfrac
-cextern dc_transmap
-
-cextern R_DrawColumn_8_ASM
-cextern R_Draw2sMultiPatchColumn_8_ASM
-
-;; spans
-cextern nflatshiftup
-cextern nflatxshift
-cextern nflatyshift
-cextern nflatmask
-cextern ds_xfrac
-cextern ds_yfrac
-cextern ds_xstep
-cextern ds_ystep
-cextern ds_x1
-cextern ds_x2
-cextern ds_y
-cextern ds_source
-cextern ds_colormap
-
-cextern ylookup
-cextern columnofs
-cextern vid
-
-[SECTION .data]
-
-nflatmask64		dq		0
-
-
-[SECTION .text]
-
-;;----------------------------------------------------------------------
-;;
-;; R_DrawColumn : 8bpp column drawer
-;;
-;; MMX column drawer.
-;;
-;;----------------------------------------------------------------------
-;; eax = accumulator
-;; ebx = colormap
-;; ecx = count
-;; edx = accumulator
-;; esi = source
-;; edi = dest
-;; ebp = vid.width
-;; mm0 = accumulator
-;; mm1 = heightmask, twice
-;; mm2 = 2 * fracstep, twice
-;; mm3 = pair of consecutive fracs
-;;----------------------------------------------------------------------
-
-
-cglobal R_DrawColumn_8_MMX
-R_DrawColumn_8_MMX:
-		push		ebp						;; preserve caller's stack frame pointer
-		push		esi						;; preserve register variables
-		push		edi
-		push		ebx
-
-;;
-;; Our algorithm requires that the texture height be a power of two.
-;; If not, fall back to the non-MMX drawer.
-;;
-.texheightcheck:
-		mov			edx, [dc_texheight]
-		sub			edx, 1					;; edx = heightmask
-		test		edx, [dc_texheight]
-		jnz			near .usenonMMX
-
-		mov			ebp, edx				;; Keep a copy of heightmask in a
-											;; GPR for the time being.
-
-;;
-;; Fill mm1 with heightmask
-;;
-		movd		mm1, edx				;; low dword = heightmask
-		punpckldq	mm1, mm1				;; copy low dword to high dword
-
-;;
-;; dest = ylookup[dc_yl] + columnofs[dc_x];
-;;
-		mov			eax, [dc_yl]
-		mov			edi, [ylookup+eax*4]
-		mov			ebx, [dc_x]
-		add			edi, [columnofs+ebx*4]	;; edi = dest
-
-
-;;
-;; pixelcount = yh - yl + 1
-;;
-		mov			ecx, [dc_yh]
-		add			ecx, 1
-		sub			ecx, eax				;; pixel count
-		jle			near .done				;; nothing to scale
-
-;;
-;; fracstep = dc_iscale;
-;;
-		movd		mm2, [dc_iscale]		;; fracstep in low dword
-		punpckldq	mm2, mm2				;; copy to high dword
-
-		mov			ebx, [dc_colormap]
-		mov			esi, [dc_source]
-
-;;
-;; frac = (dc_texturemid + FixedMul((dc_yl << FRACBITS) - centeryfrac, fracstep));
-;;
-											;; eax == dc_yl already
-		shl			eax, FRACBITS
-		sub			eax, [centeryfrac]
-		imul		dword [dc_iscale]
-		shrd		eax, edx, FRACBITS
-		add			eax, [dc_texturemid]
-
-;;
-;; if (dc_hires) frac = 0;
-;;
-		test		byte [dc_hires], 0x01
-		jz			.mod2
-		xor			eax, eax
-
-
-;;
-;; Do mod-2 pixel.
-;;
-.mod2:
-		test		ecx, 1
-		jz			.pairprepare
-		mov			edx, eax				;; edx = frac
-		add			eax, [dc_iscale]		;; eax += fracstep
-		sar			edx, FRACBITS
-		and			edx, ebp				;; edx &= heightmask
-		movzx		edx, byte [esi + edx]
-		movzx		edx, byte [ebx + edx]
-		mov			[edi], dl
-
-		add			edi, [vid + viddef_s.width]
-		sub			ecx, 1
-		jz			.done
-
-.pairprepare:
-;;
-;; Prepare for the main loop.
-;;
-		movd		mm3, eax				;; Low dword = frac
-		movq		mm4, mm3				;; Copy to intermediate register
-		paddd		mm4, mm2				;; dwords of mm4 += fracstep
-		punpckldq	mm3, mm4				;; Low dword = first frac, high = second
-		pslld		mm2, 1					;; fracstep *= 2
-
-;;
-;; ebp = vid.width
-;;
-		mov			ebp, [vid + viddef_s.width]
-
-		align		16
-.pairloop:
-		movq		mm0, mm3				;; 3B 1u.
-		psrad		mm0, FRACBITS			;; 4B 1u.
-		pand		mm0, mm1				;; 3B 1u. frac &= heightmask
-		paddd		mm3, mm2				;; 3B 1u. frac += fracstep
-
-		movd		eax, mm0				;; 3B 1u. Get first frac
-;; IFETCH boundary
-		movzx		eax, byte [esi + eax]	;; 4B 1u. Texture map
-		movzx		eax, byte [ebx + eax]	;; 4B 1u. Colormap
-
-		punpckhdq	mm0, mm0				;; 3B 1(2)u. low dword = high dword
-		movd		edx, mm0				;; 3B 1u. Get second frac
-		mov			[edi], al				;; 2B 1(2)u. First pixel
-;; IFETCH boundary
-
-		movzx		edx, byte [esi + edx]	;; 4B 1u. Texture map
-		movzx		edx, byte [ebx + edx]	;; 4B 1u. Colormap
-		mov			[edi + 1*ebp], dl		;; 3B 1(2)u. Second pixel
-
-		lea			edi, [edi + 2*ebp]		;; 3B 1u. edi += 2 * vid.width
-;; IFETCH boundary
-		sub			ecx, 2					;; 3B 1u. count -= 2
-		jnz			.pairloop				;; 2B 1u. if(count != 0) goto .pairloop
-
-
-.done:
-;;
-;; Clear MMX state, or else FPU operations will go badly awry.
-;;
-		emms
-
-		pop			ebx
-		pop			edi
-		pop			esi
-		pop			ebp
-		ret
-
-.usenonMMX:
-		call		R_DrawColumn_8_ASM
-		jmp			.done
-
-
-;;----------------------------------------------------------------------
-;;
-;; R_Draw2sMultiPatchColumn : Like R_DrawColumn, but omits transparent
-;;                            pixels.
-;;
-;; MMX column drawer.
-;;
-;;----------------------------------------------------------------------
-;; eax = accumulator
-;; ebx = colormap
-;; ecx = count
-;; edx = accumulator
-;; esi = source
-;; edi = dest
-;; ebp = vid.width
-;; mm0 = accumulator
-;; mm1 = heightmask, twice
-;; mm2 = 2 * fracstep, twice
-;; mm3 = pair of consecutive fracs
-;;----------------------------------------------------------------------
-
-
-cglobal R_Draw2sMultiPatchColumn_8_MMX
-R_Draw2sMultiPatchColumn_8_MMX:
-		push		ebp						;; preserve caller's stack frame pointer
-		push		esi						;; preserve register variables
-		push		edi
-		push		ebx
-
-;;
-;; Our algorithm requires that the texture height be a power of two.
-;; If not, fall back to the non-MMX drawer.
-;;
-.texheightcheck:
-		mov			edx, [dc_texheight]
-		sub			edx, 1					;; edx = heightmask
-		test		edx, [dc_texheight]
-		jnz			near .usenonMMX
-
-		mov			ebp, edx				;; Keep a copy of heightmask in a
-											;; GPR for the time being.
-
-;;
-;; Fill mm1 with heightmask
-;;
-		movd		mm1, edx				;; low dword = heightmask
-		punpckldq	mm1, mm1				;; copy low dword to high dword
-
-;;
-;; dest = ylookup[dc_yl] + columnofs[dc_x];
-;;
-		mov			eax, [dc_yl]
-		mov			edi, [ylookup+eax*4]
-		mov			ebx, [dc_x]
-		add			edi, [columnofs+ebx*4]	;; edi = dest
-
-
-;;
-;; pixelcount = yh - yl + 1
-;;
-		mov			ecx, [dc_yh]
-		add			ecx, 1
-		sub			ecx, eax				;; pixel count
-		jle			near .done				;; nothing to scale
-;;
-;; fracstep = dc_iscale;
-;;
-		movd		mm2, [dc_iscale]		;; fracstep in low dword
-		punpckldq	mm2, mm2				;; copy to high dword
-
-		mov			ebx, [dc_colormap]
-		mov			esi, [dc_source]
-
-;;
-;; frac = (dc_texturemid + FixedMul((dc_yl << FRACBITS) - centeryfrac, fracstep));
-;;
-											;; eax == dc_yl already
-		shl			eax, FRACBITS
-		sub			eax, [centeryfrac]
-		imul		dword [dc_iscale]
-		shrd		eax, edx, FRACBITS
-		add			eax, [dc_texturemid]
-
-;;
-;; if (dc_hires) frac = 0;
-;;
-		test		byte [dc_hires], 0x01
-		jz			.mod2
-		xor			eax, eax
-
-
-;;
-;; Do mod-2 pixel.
-;;
-.mod2:
-		test		ecx, 1
-		jz			.pairprepare
-		mov			edx, eax				;; edx = frac
-		add			eax, [dc_iscale]		;; eax += fracstep
-		sar			edx, FRACBITS
-		and			edx, ebp				;; edx &= heightmask
-		movzx		edx, byte [esi + edx]
-		cmp			dl, TRANSPARENTPIXEL
-		je			.nextmod2
-		movzx		edx, byte [ebx + edx]
-		mov			[edi], dl
-
-.nextmod2:
-		add			edi, [vid + viddef_s.width]
-		sub			ecx, 1
-		jz			.done
-
-.pairprepare:
-;;
-;; Prepare for the main loop.
-;;
-		movd		mm3, eax				;; Low dword = frac
-		movq		mm4, mm3				;; Copy to intermediate register
-		paddd		mm4, mm2				;; dwords of mm4 += fracstep
-		punpckldq	mm3, mm4				;; Low dword = first frac, high = second
-		pslld		mm2, 1					;; fracstep *= 2
-
-;;
-;; ebp = vid.width
-;;
-		mov			ebp, [vid + viddef_s.width]
-
-		align		16
-.pairloop:
-		movq		mm0, mm3				;; 3B 1u.
-		psrad		mm0, FRACBITS			;; 4B 1u.
-		pand		mm0, mm1				;; 3B 1u. frac &= heightmask
-		paddd		mm3, mm2				;; 3B 1u. frac += fracstep
-
-		movd		eax, mm0				;; 3B 1u. Get first frac
-;; IFETCH boundary
-		movzx		eax, byte [esi + eax]	;; 4B 1u. Texture map
-		punpckhdq	mm0, mm0				;; 3B 1(2)u. low dword = high dword
-		movd		edx, mm0				;; 3B 1u. Get second frac
-		cmp			al, TRANSPARENTPIXEL	;; 2B 1u.
-		je			.secondinpair			;; 2B 1u.
-;; IFETCH boundary
-		movzx		eax, byte [ebx + eax]	;; 4B 1u. Colormap
-		mov			[edi], al				;; 2B 1(2)u. First pixel
-
-.secondinpair:
-		movzx		edx, byte [esi + edx]	;; 4B 1u. Texture map
-		cmp			dl, TRANSPARENTPIXEL	;; 2B 1u.
-		je			.nextpair				;; 2B 1u.
-;; IFETCH boundary
-		movzx		edx, byte [ebx + edx]	;; 4B 1u. Colormap
-		mov			[edi + 1*ebp], dl		;; 3B 1(2)u. Second pixel
-
-.nextpair:
-		lea			edi, [edi + 2*ebp]		;; 3B 1u. edi += 2 * vid.width
-		sub			ecx, 2					;; 3B 1u. count -= 2
-		jnz			.pairloop				;; 2B 1u. if(count != 0) goto .pairloop
-
-
-.done:
-;;
-;; Clear MMX state, or else FPU operations will go badly awry.
-;;
-		emms
-
-		pop			ebx
-		pop			edi
-		pop			esi
-		pop			ebp
-		ret
-
-.usenonMMX:
-		call		R_Draw2sMultiPatchColumn_8_ASM
-		jmp			.done
-
-
-;;----------------------------------------------------------------------
-;;
-;; R_DrawSpan : 8bpp span drawer
-;;
-;; MMX span drawer.
-;;
-;;----------------------------------------------------------------------
-;; eax = accumulator
-;; ebx = colormap
-;; ecx = count
-;; edx = accumulator
-;; esi = source
-;; edi = dest
-;; ebp = two pixels
-;; mm0 = accumulator
-;; mm1 = xposition
-;; mm2 = yposition
-;; mm3 = 2 * xstep
-;; mm4 = 2 * ystep
-;; mm5 = nflatxshift
-;; mm6 = nflatyshift
-;; mm7 = accumulator
-;;----------------------------------------------------------------------
-
-cglobal R_DrawSpan_8_MMX
-R_DrawSpan_8_MMX:
-		push		ebp						;; preserve caller's stack frame pointer
-		push		esi						;; preserve register variables
-		push		edi
-		push		ebx
-
-;;
-;; esi = ds_source
-;; ebx = ds_colormap
-;;
-		mov			esi, [ds_source]
-		mov			ebx, [ds_colormap]
-
-;;
-;; edi = ylookup[ds_y] + columnofs[ds_x1]
-;;
-		mov			eax, [ds_y]
-		mov			edi, [ylookup + eax*4]
-		mov			edx, [ds_x1]
-		add			edi, [columnofs + edx*4]
-
-;;
-;; ecx = ds_x2 - ds_x1 + 1
-;;
-		mov			ecx, [ds_x2]
-		sub			ecx, edx
-		add			ecx, 1
-
-;;
-;; Needed for fracs and steps
-;;
-		movd		mm7, [nflatshiftup]
-
-;;
-;; mm3 = xstep
-;;
-		movd		mm3, [ds_xstep]
-		pslld		mm3, mm7
-		punpckldq	mm3, mm3				;; both dwords = xstep
-
-;;
-;; mm4 = ystep
-;;
-		movd		mm4, [ds_ystep]
-		pslld		mm4, mm7
-		punpckldq	mm4, mm4				;; both dwords = ystep
-
-;;
-;; mm1 = pair of consecutive xpositions
-;;
-		movd		mm1, [ds_xfrac]
-		pslld		mm1, mm7
-		movq		mm6, mm1
-		paddd		mm6, mm3
-		punpckldq	mm1, mm6				;; low dword = x, high dword = x + xstep
-
-;;
-;; mm2 = pair of consecutive ypositions
-;;
-		movd		mm2, [ds_yfrac]
-		pslld		mm2, mm7
-		movq		mm6, mm2
-		paddd		mm6, mm4
-		punpckldq	mm2, mm6				;; low dword = y, high dword = y + ystep
-
-;;
-;; mm5 = nflatxshift
-;; mm6 = nflatyshift
-;;
-		movd		mm5, [nflatxshift]
-		movd		mm6, [nflatyshift]
-
-;;
-;; Mask is in memory due to lack of registers.
-;;
-		mov			eax, [nflatmask]
-		mov			[nflatmask64], eax
-		mov			[nflatmask64 + 4], eax
-
-
-;;
-;; Go until we reach a dword boundary.
-;;
-.unaligned:
-		test		edi, 3
-		jz			.alignedprep
-.stragglers:
-		cmp			ecx, 0
-		je			.done					;; If ecx == 0, we're finished.
-
-;;
-;; eax = ((yposition >> nflatyshift) & nflatmask) | (xposition >> nflatxshift)
-;;
-		movq		mm0, mm1				;; mm0 = xposition
-		movq		mm7, mm2				;; mm7 = yposition
-		paddd		mm1, mm3				;; xposition += xstep (once!)
-		paddd		mm2, mm4				;; yposition += ystep (once!)
-		psrld		mm0, mm5				;; shift
-		psrld		mm7, mm6				;; shift
-		pand		mm7, [nflatmask64]		;; mask
-		por			mm0, mm7				;; or x and y together
-
-		movd		eax, mm0				;; eax = index of first pixel
-		movzx		eax, byte [esi + eax]	;; al = source[eax]
-		movzx		eax, byte [ebx + eax]	;; al = colormap[al]
-
-		mov			[edi], al
-		add			edi, 1
-
-		sub			ecx, 1
-		jmp			.unaligned
-
-
-.alignedprep:
-;;
-;; We can double the steps now.
-;;
-		pslld		mm3, 1
-		pslld		mm4, 1
-
-
-;;
-;; Generate chunks of four pixels.
-;;
-.alignedloop:
-
-;;
-;; Make sure we have at least four pixels.
-;;
-		cmp			ecx, 4
-		jl			.prestragglers
-
-;;
-;; First two pixels.
-;;
-		movq		mm0, mm1				;; mm0 = xposition
-		movq		mm7, mm2				;; mm7 = yposition
-		paddd		mm1, mm3				;; xposition += xstep
-		paddd		mm2, mm4				;; yposition += ystep
-		psrld		mm0, mm5				;; shift
-		psrld		mm7, mm6				;; shift
-		pand		mm7, [nflatmask64]		;; mask
-		por			mm0, mm7				;; or x and y together
-
-		movd		eax, mm0				;; eax = index of first pixel
-		movzx		eax, byte [esi + eax]	;; al = source[eax]
-		movzx		ebp, byte [ebx + eax]	;; ebp = colormap[al]
-
-		punpckhdq	mm0, mm0				;; both dwords = high dword
-		movd		eax, mm0				;; eax = index of second pixel
-		movzx		eax, byte [esi + eax]	;; al = source[eax]
-		movzx		eax, byte [ebx + eax]	;; al = colormap[al]
-		shl			eax, 8					;; get pixel in right byte
-		or			ebp, eax				;; put pixel in ebp
-
-;;
-;; Next two pixels.
-;;
-		movq		mm0, mm1				;; mm0 = xposition
-		movq		mm7, mm2				;; mm7 = yposition
-		paddd		mm1, mm3				;; xposition += xstep
-		paddd		mm2, mm4				;; yposition += ystep
-		psrld		mm0, mm5				;; shift
-		psrld		mm7, mm6				;; shift
-		pand		mm7, [nflatmask64]		;; mask
-		por			mm0, mm7				;; or x and y together
-
-		movd		eax, mm0				;; eax = index of third pixel
-		movzx		eax, byte [esi + eax]	;; al = source[eax]
-		movzx		eax, byte [ebx + eax]	;; al = colormap[al]
-		shl			eax, 16					;; get pixel in right byte
-		or			ebp, eax				;; put pixel in ebp
-
-		punpckhdq	mm0, mm0				;; both dwords = high dword
-		movd		eax, mm0				;; eax = index of fourth pixel
-		movzx		eax, byte [esi + eax]	;; al = source[eax]
-		movzx		eax, byte [ebx + eax]	;; al = colormap[al]
-		shl			eax, 24					;; get pixel in right byte
-		or			ebp, eax				;; put pixel in ebp
-
-;;
-;; Write pixels.
-;;
-		mov			[edi], ebp
-		add			edi, 4
-
-		sub			ecx, 4
-		jmp			.alignedloop
-
-.prestragglers:
-;;
-;; Back to one step at a time.
-;;
-		psrad		mm3, 1					;; NOTE(review): pslld above dropped bit 31, so this only restores
-		psrad		mm4, 1					;; the original step if its top two bits were equal -- confirm
-		jmp			.stragglers
-
-.done:
-;;
-;; Clear MMX state, or else FPU operations will go badly awry.
-;;
-		emms
-
-		pop			ebx
-		pop			edi
-		pop			esi
-		pop			ebp
-		ret
--- a/src/tmap_vc.nas
+++ b/src/tmap_vc.nas
-;; SONIC ROBO BLAST 2
-;;-----------------------------------------------------------------------------
-;; Copyright (C) 1998-2000 by DooM Legacy Team.
-;; Copyright (C) 1999-2021 by Sonic Team Junior.
-;;
-;; This program is free software distributed under the
-;; terms of the GNU General Public License, version 2.
-;; See the 'LICENSE' file for more details.
-;;-----------------------------------------------------------------------------
-;; FILE:
-;;      tmap_vc.nas
-;; DESCRIPTION:
-;;      Assembler optimised math code for Visual C++.
-
-
-[BITS 32]
-
-%macro cglobal 1
-%define %1 _%1
-[global %1]
-%endmacro
-
-[SECTION .text write]
-
-;----------------------------------------------------------------------------
-;fixed_t FixedMul (fixed_t a, fixed_t b)
-;----------------------------------------------------------------------------
-cglobal FixedMul
-;       align   16
-FixedMul:
-        mov     eax,[esp+4]             ;; eax = a (first stack argument)
-        imul    dword [esp+8]           ;; edx:eax = a * b, signed 64-bit product
-        shrd    eax,edx,16              ;; eax = bits 16..47 of the product
-        ret                             ;; result is left in eax
-
-;----------------------------------------------------------------------------
-;fixed_t FixedDiv2 (fixed_t a, fixed_t b);
-;----------------------------------------------------------------------------
-cglobal FixedDiv2
-;       align   16
-FixedDiv2:
-        mov     eax,[esp+4]             ;; eax = a (first stack argument)
-        mov     edx,eax                 ;; these two instructions allow the next
-        sar     edx,31                  ;; two to pair, on the Pentium processor.
-        shld    edx,eax,16              ;; edx = high dword of sign-extended (a << 16)
-        sal     eax,16                  ;; eax = low dword of (a << 16)
-        idiv    dword [esp+8]           ;; eax = (a << 16) / b, edx = remainder
-        ret                             ;; quotient is left in eax
--- a/src/w_wad.c
+++ b/src/w_wad.c
@@ -82,6 +82,14 @@
 #define O_BINARY 0
 #endif

+#ifdef HAVE_THREADS
+static I_mutex wad_mutex; /* held around the lumpcache/patchcache accesses bracketed by Lock_state()/Unlock_state() in this file */
+#  define Lock_state()    I_lock_mutex(&wad_mutex) /* NOTE(review): lock receives the mutex's address, unlock its value -- presumably I_lock_mutex creates the mutex on first use; confirm in i_threads.h */
+#  define Unlock_state() I_unlock_mutex(wad_mutex)
+#else
+#  define Lock_state() /* expands to nothing when HAVE_THREADS is not defined */
+#  define Unlock_state() /* expands to nothing when HAVE_THREADS is not defined */
+#endif

 typedef struct
 {
@@ -1922,6 +1930,8 @@ void *W_CacheLumpNumPwad(UINT16 wad, UINT16 lump, INT32 tag)
 	if (!TestValidLump(wad,lump))
 		return NULL;

+	Lock_state();
+
 	lumpcache = wadfiles[wad]->lumpcache;
 	if (!lumpcache[lump])
 	{
@@ -1931,6 +1941,8 @@ void *W_CacheLumpNumPwad(UINT16 wad, UINT16 lump, INT32 tag)
 	else
 		Z_ChangeTag(lumpcache[lump], tag);

+	Unlock_state();
+
 	return lumpcache[lump];
 }

@@ -1955,9 +1967,13 @@ void *W_CacheLumpNumForce(lumpnum_t lumpnum, INT32 tag)
 	if (!TestValidLump(wad,lump))
 		return NULL;

+	Lock_state();
+
 	ptr = Z_Malloc(W_LumpLengthPwad(wad, lump), tag, NULL);
 	W_ReadLumpHeaderPwad(wad, lump, ptr, 0, 0);  // read the lump in full

+	Unlock_state();
+
 	return ptr;
 }

@@ -1975,15 +1991,23 @@ static inline boolean W_IsLumpCachedPWAD(UINT16 wad, UINT16 lump, void *ptr)
 	if (!TestValidLump(wad, lump))
 		return false;

+	Lock_state();
+
 	lcache = wadfiles[wad]->lumpcache[lump];

 	if (ptr)
 	{
-		if (ptr == lcache)
+		if (ptr == lcache) {
+			Unlock_state();
 			return true;
+		}
 	}
-	else if (lcache)
+	else if (lcache) {
+		Unlock_state();
 		return true;
+	}
+
+	Unlock_state();

 	return false;
 }
@@ -2007,15 +2031,23 @@ static inline boolean W_IsPatchCachedPWAD(UINT16 wad, UINT16 lump, void *ptr)
 	if (!TestValidLump(wad, lump))
 		return false;

+	Lock_state();
+
 	lcache = wadfiles[wad]->patchcache[lump];

 	if (ptr)
 	{
-		if (ptr == lcache)
+		if (ptr == lcache) {
+			Unlock_state();
 			return true;
+		}
 	}
-	else if (lcache)
+	else if (lcache) {
+		Unlock_state();
 		return true;
+	}
+
+	Unlock_state();

 	return false;
 }
@@ -2048,7 +2080,7 @@ void *W_CacheLumpName(const char *name, INT32 tag)
 // Cache a patch into heap memory, convert the patch format as necessary
 //

-void *W_CacheSoftwarePatchNumPwad(UINT16 wad, UINT16 lump, INT32 tag)
+static void *W_CacheSoftwarePatch(UINT16 wad, UINT16 lump, INT32 tag)
 {
 	lumpcache_t *lumpcache = NULL;

@@ -2082,11 +2114,6 @@ void *W_CacheSoftwarePatchNumPwad(UINT16 wad, UINT16 lump, INT32 tag)
 	return lumpcache[lump];
 }

-void *W_CacheSoftwarePatchNum(lumpnum_t lumpnum, INT32 tag)
-{
-	return W_CacheSoftwarePatchNumPwad(WADFILENUM(lumpnum),LUMPNUM(lumpnum),tag);
-}
-
 void *W_CachePatchNumPwad(UINT16 wad, UINT16 lump, INT32 tag)
 {
 	patch_t *patch;
@@ -2094,16 +2121,23 @@ void *W_CachePatchNumPwad(UINT16 wad, UINT16 lump, INT32 tag)
 	if (!TestValidLump(wad, lump))
 		return NULL;

-	patch = W_CacheSoftwarePatchNumPwad(wad, lump, tag);
+	Lock_state();
+
+	patch = W_CacheSoftwarePatch(wad, lump, tag);

 #ifdef HWRENDER
 	// Software-only compile cache the data without conversion
 	if (rendermode == render_soft || rendermode == render_none)
 #endif
+	{
+		Unlock_state();
 		return (void *)patch;
+	}

 #ifdef HWRENDER
 	Patch_CreateGL(patch);
+	Unlock_state();
+
 	return (void *)patch;
 #endif
 }
@@ -2118,6 +2152,8 @@ void W_UnlockCachedPatch(void *patch)
 	if (!patch)
 		return;

+	Lock_state();
+
 	// The hardware code does its own memory management, as its patches
 	// have different lifetimes from software's.
 #ifdef HWRENDER
@@ -2126,6 +2162,8 @@ void W_UnlockCachedPatch(void *patch)
 	else
 #endif
 		Z_Unlock(patch);
+
+	Unlock_state();
 }

 void *W_CachePatchName(const char *name, INT32 tag)

--- a/src/w_wad.h
+++ b/src/w_wad.h
@@ -207,11 +207,6 @@ void *W_CachePatchLongName(const char *name, INT32 tag);
 void *W_CachePatchNumPwad(UINT16 wad, UINT16 lump, INT32 tag);
 void *W_CachePatchNum(lumpnum_t lumpnum, INT32 tag);

-// Returns a Software patch.
-// Performs any necessary conversions from PNG images.
-void *W_CacheSoftwarePatchNumPwad(UINT16 wad, UINT16 lump, INT32 tag);
-void *W_CacheSoftwarePatchNum(lumpnum_t lumpnum, INT32 tag);
-
 void W_UnlockCachedPatch(void *patch);

 void W_VerifyFileMD5(UINT16 wadfilenum, const char *matchmd5);

--- a/src/z_zone.c
+++ b/src/z_zone.c
@@ -39,6 +39,15 @@
 #include "hardware/hw_main.h" // For hardware memory info
 #endif

+#ifdef HAVE_THREADS
+static I_mutex Z_mutex; /* NOTE(review): Z_CallocAlign takes this lock and then calls Z_MallocAlign, which takes it again -- requires I_mutex to be recursive; confirm */
+#  define Lock_state()    I_lock_mutex(&Z_mutex) /* NOTE(review): lock receives the mutex's address, unlock its value -- presumably I_lock_mutex creates the mutex on first use; confirm in i_threads.h */
+#  define Unlock_state() I_unlock_mutex(Z_mutex)
+#else
+#  define Lock_state() /* expands to nothing when HAVE_THREADS is not defined */
+#  define Unlock_state() /* expands to nothing when HAVE_THREADS is not defined */
+#endif
+
 #ifdef HAVE_VALGRIND
 #include "valgrind.h"
 static boolean Z_calloc = false;
@@ -203,6 +212,8 @@ void Z_Free(void *ptr)
 	if (ptr == NULL)
 		return;

+	Lock_state();
+
 #ifdef ZDEBUG2
 	CONS_Debug(DBG_MEMORY, "Z_Free %s:%d\n", file, line);
 #endif
@@ -237,6 +248,8 @@ void Z_Free(void *ptr)
 	block->prev->next = block->next;
 	block->next->prev = block->prev;
 	free(block);
+
+	Unlock_state();
 }

 /** malloc() that doesn't accept failure.
@@ -295,6 +308,8 @@ void *Z_MallocAlign(size_t size, INT32 tag, void *user, INT32 alignbits)
 	void *given;
 	size_t blocksize = extrabytes + sizeof *hdr + size;

+	Lock_state();
+
 #ifdef ZDEBUG2
 	CONS_Debug(DBG_MEMORY, "Z_Malloc %s:%d\n", file, line);
 #endif
@@ -359,6 +374,8 @@ void *Z_MallocAlign(size_t size, INT32 tag, void *user, INT32 alignbits)
 		I_Error("Z_Malloc: attempted to allocate purgable block "
 			"(size %s) with no user", sizeu1(size));

+	Unlock_state();
+
 	return given;
 }

@@ -381,14 +398,19 @@ void *Z_Calloc2(size_t size, INT32 tag, void *user, INT32 alignbits, const char
 void *Z_CallocAlign(size_t size, INT32 tag, void *user, INT32 alignbits)
 #endif
 {
+	void *mem;
+	Lock_state();
 #ifdef VALGRIND_MEMPOOL_ALLOC
 	Z_calloc = true;
 #endif
 #ifdef ZDEBUG
-	return memset(Z_Malloc2    (size, tag, user, alignbits, file, line), 0, size);
+	mem = Z_Malloc2    (size, tag, user, alignbits, file, line);
 #else
-	return memset(Z_MallocAlign(size, tag, user, alignbits            ), 0, size);
+	mem = Z_MallocAlign(size, tag, user, alignbits            );
 #endif
+	memset(mem, 0, size);
+	Unlock_state();
+	return mem;
 }

 /** The Z_ReallocAlign function.
No results found