home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Enigma Amiga Life 113
/
EnigmaAmiga113CD.iso
/
software
/
sviluppo
/
quake_src
/
d_scanppc.s
< prev
next >
Wrap
Text File
|
2000-06-17
|
50KB
|
1,534 lines
#
# Copyright (C) 1996-1997 Id Software, Inc.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
##
## Quake for AMIGA
##
## d_scanPPC.s
##
## Define WOS for PowerOpen ABI, otherwise SVR4-ABI is used.
##
# NOLR is defined before macrosPPC.i is included, so it is visible to the
# macros in that file.
# NOTE(review): presumably it tells the prologue/epilogue macros not to
# save/restore the link register (no function in the visible part of this
# file makes a call) -- confirm against macrosPPC.i.
.set NOLR,1
.include "macrosPPC.i"
# rsreset: restart the running offset counter (rscnt) at 0. Used before
# laying out a group of stack locals with "rs".
.macro rsreset
.set rscnt,0
.endm
# rs name,size: define the symbol <name> as the current value of rscnt,
# then advance rscnt by <size> bytes.
.macro rs
.set \1,rscnt
.set rscnt,rscnt+\2
.endm
#
# external references
#
# In the visible part of this file, symbols declared with xrefv are read
# by value through the lw/ls/lf macros, and symbols declared with xrefa
# have their address taken with the lxa macro. The macros themselves are
# defined in macrosPPC.i.
#
xrefv cacheblock
xrefv d_sdivzorigin
xrefv d_sdivzstepu
xrefv d_sdivzstepv
xrefv d_tdivzorigin
xrefv d_tdivzstepu
xrefv d_tdivzstepv
xrefv d_ziorigin
xrefv d_zistepu
xrefv d_zistepv
xrefv sadjust
xrefv tadjust
xrefv sdivz
xrefv tdivz
xrefv bbextents
xrefv bbextentt
xrefv d_viewbuffer
xrefv screenwidth
xrefv cachewidth
xrefv d_zwidth
xrefv d_pzbuffer
# NOTE(review): sintable is declared here with xrefv and again below with
# xrefa. The visible code only uses it through lxa (address form) --
# confirm whether the xrefv declaration is needed.
xrefv sintable
xrefa d_subdiv16
xrefa cl
xrefa intsintable
xrefa sintable
xrefa vid
xrefa scr_vrect
xrefa r_refdef
xrefv INT2DBL_0
xrefv c0
xrefv c2
xrefv c65536
xrefa _ReciprocTable
#
# defines
#
# NOTE(review): QDIV, NICE_DIV and PSPAN_NEXT are not referenced in the
# visible part of this file -- they may be used further down or be leftovers.
.set QDIV,1
.set NICE_DIV,1
# Byte offset 12; the span loops below load the next-span pointer from that
# same offset (4 bytes past the count field at offset 8).
.set PSPAN_NEXT,0xc
# Byte offset of cl.time inside cl (loaded with lfd, i.e. as a double).
.set CL_TIME,0x23c
# Byte offsets inside vid.
.set VID_BUFFER,0
.set VID_ROWBYTES,16
# Byte offsets inside scr_vrect.
.set SCR_VRECT_X,0
.set SCR_VRECT_Y,4
.set SCR_VRECT_WIDTH,8
.set SCR_VRECT_HEIGHT,12
# Byte offsets of r_refdef.vrect inside r_refdef.
.set REFDEF_VRECT_X,0
.set REFDEF_VRECT_Y,4
.set REFDEF_VRECT_WIDTH,8
.set REFDEF_VRECT_HEIGHT,12
# Byte offset of the value field inside a cvar (loaded with lfs, i.e. as a
# single-precision float).
.set CVAR_VALUE,16
# MUST match the #define in d_iface.h!
# CYCLE is the number of entries used from the sine tables (indices are
# masked with CYCLE-1). Several rlwinm masks in Turbulent8 hard-code
# CYCLE = 128.
.set CYCLE,128
# AMP2*2 is the number of extra rows/columns the warp tables cover beyond
# the visible rectangle (see D_WarpScreen).
.set AMP2,3
# NOTE(review): SPEED is not used directly in the visible code, which loads
# a float constant named cSPEED instead -- presumably the same value.
.set SPEED,20
###########################################################################
#
# void D_WarpScreen (void)
#
# water effect algorithm
#
# Builds two lookup tables on the stack and then copies the rendered view
# to vid.buffer through them:
#   rowptr[v] = address of the source row used for destination row v
#   column[u] = source x coordinate used for destination column u
# Both tables hold (scr_vrect dimension + AMP2*2) entries, because the
# copy loop adds a displacement taken from intsintable to the table index.
#
# Takes no arguments and returns nothing. r24-r31 are saved on entry and
# restored on exit.
#
# Register use in the copy loop (.wsloop3 / .wsloop4):
#   r4  -> rowptr[]            r5  -> column[]
#   r6  =  v                   r7  -> turb[u] (pre-increment pointer)
#   r9  =  scr_vrect.width>>2  r10 =  scr_vrect.height
#   r11 -> dest[u] - 1         r12 -> col[u]  (pre-increment pointer)
#   r26 =  dest                r27 =  turb
#   r29 =  vid.rowbytes        r31 =  row
#   r0, r8, r30 = scratch
#
# The inner loop is unrolled four times and runs scr_vrect.width>>2 times,
# so only whole groups of four pixels are written per row.
# NOTE(review): a width below 4 would load 0 into CTR and make bdnz run
# 2^32 times -- presumably the caller never passes such a rectangle.
#
###########################################################################
funcdef D_WarpScreen
# Stack locals. .wsloop and .wsloop2 fill the tables up to index
# (dimension + AMP2*2 - 1), so each table reserves AMP2*2 entries on top of
# the 1024 / 1280 maximum dimension. Without them the last rowptr[] entries
# of a 1024 row view would be overwritten when column[] is filled.
rsreset
rs int2dbl_tmp1,8
rs dbl2int_tmp1,8
rs rowptr,1024*4+AMP2*2*4
rs column,1280*4+AMP2*2*4
init 0,rscnt+512,8,0
stmw r24,gb(r1) #save r24-r31
la r4,local+rowptr(r1) #r4 -> rowptr[1024+AMP2*2]
la r5,local+column(r1) #r5 -> column[1280+AMP2*2]
lf f1,INT2DBL_0 #f1 = 0x4330000080000000
stfd f1,local+int2dbl_tmp1(r1) #int2dbl_tmp1 = 0x43300000...
lxa r6,vid #r6 -> vid
lxa r7,r_refdef #r7 -> r_refdef
lxa r8,scr_vrect #r8 -> scr_vrect
lwz r9,SCR_VRECT_WIDTH(r8) #r9 = scr_vrect.width
lwz r10,SCR_VRECT_HEIGHT(r8) #r10 = scr_vrect.height
lwz r11,REFDEF_VRECT_X(r7) #r11 = r_refdef.vrect.x
lwz r12,REFDEF_VRECT_Y(r7) #r12 = r_refdef.vrect.y
lwz r0,REFDEF_VRECT_WIDTH(r7)
int2dbl f2,r0,r0,local+int2dbl_tmp1,f1 #f2 = w = (float)r_refdef.vrect.width
lwz r0,REFDEF_VRECT_HEIGHT(r7)
int2dbl f3,r0,r0,local+int2dbl_tmp1,f1 #f3 = h = (float)r_refdef.vrect.height
lw r31,screenwidth #r31 = screenwidth
ls f4,cAMP2times2 #f4 = AMP2*2
# w = r_refdef.vrect.width;
# h = r_refdef.vrect.height;
#
# wratio = w / (float)scr_vrect.width;
# hratio = h / (float)scr_vrect.height;
#
# The two scale factors are computed with a single division each:
#   f2 = w*w / ((w + AMP2*2) * scr_vrect.width)  = wratio*w/(w+AMP2*2)
#   f3 = h*h / ((h + AMP2*2) * scr_vrect.height) = hratio*h/(h+AMP2*2)
fadds f5,f2,f4 #f5 = w + AMP2*2
fadds f6,f3,f4 #f6 = h + AMP2*2
int2dbl f0,r9,r0,local+int2dbl_tmp1,f1
fmuls f5,f5,f0 #* (float)scr_vrect.width
int2dbl f0,r10,r0,local+int2dbl_tmp1,f1
fmuls f6,f6,f0 #* (float)scr_vrect.height
fmuls f2,f2,f2 #w*w
fmuls f3,f3,f3 #h*h
fdivs f2,f2,f5 #f2 = wratio*w/(w+AMP2*2)
fdivs f3,f3,f6 #f3 = hratio*h/(h+AMP2*2)
mullw r12,r12,r31 #r12 = r_refdef.vrect.y*screenwidth
lw r29,d_viewbuffer
add r12,r12,r29 #r12 = d_viewbuffer + r12
lwz r29,VID_ROWBYTES(r6) #r29 = vid.rowbytes
addi r27,r9,AMP2*2 #r27 = scr_vrect.width + AMP2*2
addi r28,r10,AMP2*2 #r28 = scr_vrect.height + AMP2*2
# for (v=0 ; v<scr_vrect.height+AMP2*2 ; v++)
# {
# rowptr[v] = d_viewbuffer + (r_refdef.vrect.y * screenwidth) +
# (screenwidth * (int)((float)v * hratio * h / (h + AMP2 * 2)));
# }
li r26,0 #v = 0
subi r25,r4,4 #r25 -> rowptr[0] - 4 (for stwu)
.wsloop:
int2dbl f7,r26,r0,local+int2dbl_tmp1,f1 #f7 = (float)v
fmuls f7,f7,f3 #(float)v*hratio*h/(h+AMP2*2)
fctiwz f0,f7
stfd f0,local+dbl2int_tmp1(r1)
lwz r24,local+dbl2int_tmp1+4(r1) #r24 = (int)f7
mullw r24,r24,r31
add r24,r24,r12
addi r26,r26,1 #v++
stwu r24,4(r25) #rowptr[v-1] = r24
cmpw r26,r28
blt .wsloop
# for (u=0 ; u<scr_vrect.width+AMP2*2 ; u++)
# {
# column[u] = r_refdef.vrect.x +
# (int)((float)u * wratio * w / (w + AMP2 * 2));
# }
li r26,0 #u = 0
subi r25,r5,4 #r25 -> column[0] - 4 (for stwu)
.wsloop2:
int2dbl f7,r26,r0,local+int2dbl_tmp1,f1 #f7 = (float)u
fmuls f7,f7,f2 #(float)u*wratio*w/(w+AMP2*2)
fctiwz f0,f7
stfd f0,local+dbl2int_tmp1(r1)
lwz r24,local+dbl2int_tmp1+4(r1) #r24 = (int)f7
add r24,r24,r11
addi r26,r26,1 #u++
stwu r24,4(r25) #column[u-1] = r24
cmpw r26,r27
blt .wsloop2
# turb = intsintable + ((int)(cl.time*SPEED)&(CYCLE-1));
# dest = vid.buffer + scr_vrect.y * vid.rowbytes + scr_vrect.x;
# for (v=0 ; v<scr_vrect.height ; v++, dest += vid.rowbytes)
# {
# col = &column[turb[v]];
# row = &rowptr[v];
# for (u=0 ; u<scr_vrect.width ; u+=4)
# {
# dest[u+0] = row[turb[u+0]][col[u+0]];
# dest[u+1] = row[turb[u+1]][col[u+1]];
# dest[u+2] = row[turb[u+2]][col[u+2]];
# dest[u+3] = row[turb[u+3]][col[u+3]];
# }
# }
srawi r9,r9,2 #r9 = inner loop count = width / 4
lxa r28,cl
lfd f7,CL_TIME(r28) #f7 = cl.time
ls f0,cSPEED
fmuls f7,f7,f0
fctiwz f0,f7
stfd f0,local+dbl2int_tmp1(r1)
lwz r27,local+dbl2int_tmp1+4(r1) #r27 = (int)(cl.time*SPEED)
andi. r27,r27,CYCLE-1
slwi r27,r27,2 #table entries are 4 bytes wide
lxa r26,intsintable
add r27,r27,r26 #r27 = turb = intsintable + ...
lwz r28,SCR_VRECT_Y(r8)
mullw r28,r28,r29
lwz r26,SCR_VRECT_X(r8)
add r26,r26,r28
lwz r28,VID_BUFFER(r6)
add r26,r26,r28 #r26 = dest = vid.buffer + ...
li r6,0 #v = 0
.wsloop3:
slwi r0,r6,2 #r0 = v*4 (byte offset)
mtctr r9
subi r7,r27,4 #r7 -> turb[u] - 4
lwzx r8,r27,r0 #r8 = turb[v]
subi r11,r26,1 #r11 -> dest[u] - 1
slwi r8,r8,2
add r12,r5,r8 #r12 = col = &column[turb[v]]
subi r12,r12,4 #r12 -> col[u] - 4
add r31,r4,r0 #r31 = row = &rowptr[v]
.wsloop4:
lwzu r0,4(r7) #r0 = turb[u+0]
slwi r0,r0,2
lwzx r30,r31,r0 #r30 = row[turb[u+0]]
lwzu r0,4(r12) #r0 = col[u+0]
lbzx r0,r30,r0 #r0 = row[turb[u+0]][col[u+0]]
stbu r0,1(r11) #dest[u+0] = r0
lwzu r0,4(r7) #r0 = turb[u+1]
slwi r0,r0,2
lwzx r30,r31,r0 #r30 = row[turb[u+1]]
lwzu r0,4(r12) #r0 = col[u+1]
lbzx r0,r30,r0 #r0 = row[turb[u+1]][col[u+1]]
stbu r0,1(r11) #dest[u+1] = r0
lwzu r0,4(r7) #r0 = turb[u+2]
slwi r0,r0,2
lwzx r30,r31,r0 #r30 = row[turb[u+2]]
lwzu r0,4(r12) #r0 = col[u+2]
lbzx r0,r30,r0 #r0 = row[turb[u+2]][col[u+2]]
stbu r0,1(r11) #dest[u+2] = r0
lwzu r0,4(r7) #r0 = turb[u+3]
slwi r0,r0,2
lwzx r30,r31,r0 #r30 = row[turb[u+3]]
lwzu r0,4(r12) #r0 = col[u+3]
lbzx r0,r30,r0 #r0 = row[turb[u+3]][col[u+3]]
stbu r0,1(r11) #dest[u+3] = r0
bdnz .wsloop4
add r26,r26,r29 #dest += vid.rowbytes
addi r6,r6,1 #v++
cmpw r6,r10
blt .wsloop3
lmw r24,gb(r1) #restore r24-r31
exit
funcend D_WarpScreen
###########################################################################
#
# void Turbulent8 (espan_t *pspan)
#
# standard scan drawing function for animated textures
# Note: The function D_DrawTurbulent8Span was inlined into this
# function, because it's never used anywhere else.
#
# In:  r3 -> first span of a linked list. Each span is read as four
#      consecutive words: u, v, count, pointer to the next span (0 ends
#      the list).
# Out: nothing. r25-r31 and f14-f29 are saved on entry and restored on exit.
#
# Register use:
#   r3  = span cursor (kept 4 bytes low so that lwzu can pre-increment)
#   r4  = r_turb_turb (sintable + time dependent offset)
#   r6  = d_viewbuffer            r7  = screenwidth
#   r8  = s (16.16)               r9  = t (16.16)
#   r10 = spancount-1 / scratch   r11 = pdest - 1
#   r12 = pixels left in span     r25 = scratch
#   r26 -> _ReciprocTable         r27 = pbase (cacheblock)
#   r28 = tstep   r29 = sstep     r30 = tnext   r31 = snext
#   f1-f3 = d_sdivz stepu/stepv/origin   f4-f6 = d_tdivz stepu/stepv/origin
#   f7-f9 = d_zi stepu/stepv/origin      f10 = int2dbl constant
#   f11 = sadjust  f12 = tadjust  f13 = bbextents  f14 = bbextentt
#   f15 = 65536    f16 = 16       f25 = 0          f28 = 2
#   f17-f19 = sdivz/tdivz/zi step per 16 pixels
#   f22 = sdivz    f23 = tdivz    f26 = zi         f24 = z = 65536/zi
#   f0, f20, f21, f29 = scratch
#
# 1/zi is not computed with a division: frsqrte of zi*zi yields an
# estimate of 1/|zi|, which is refined by two steps of x = x*(2 - x*zi).
#
###########################################################################
funcdef Turbulent8
init 0,16,7,16 # 16 local bytes for int2dbl/dbl2int
stmw r25,gb(r1)
stfd f14,fb+0*8(r1)
stfd f15,fb+1*8(r1)
stfd f16,fb+2*8(r1)
stfd f17,fb+3*8(r1)
stfd f18,fb+4*8(r1)
stfd f19,fb+5*8(r1)
stfd f20,fb+6*8(r1)
stfd f21,fb+7*8(r1)
stfd f22,fb+8*8(r1)
stfd f23,fb+9*8(r1)
stfd f24,fb+10*8(r1)
stfd f25,fb+11*8(r1)
stfd f26,fb+12*8(r1)
stfd f27,fb+13*8(r1)
stfd f28,fb+14*8(r1)
stfd f29,fb+15*8(r1)
lxa r26,_ReciprocTable
lw r27,cacheblock #r27 = r_turb_pbase
lxa r5,cl
lfd f1,CL_TIME(r5) #f1 = cl.time
ls f2,cSPEED
fmuls f1,f1,f2
fctiwz f0,f1
stfd f0,local+8(r1)
lwz r4,local+12(r1) #r4 = (int)(cl.time*SPEED)
andi. r4,r4,CYCLE-1
slwi r4,r4,2 #table entries are 4 bytes wide
lxa r5,sintable
add r4,r4,r5 #r4 = r_turb_turb = sintable + ...
ls f1,d_sdivzstepu #f1 = d_sdivzstepu
ls f2,d_sdivzstepv #f2 = d_sdivzstepv
ls f3,d_sdivzorigin #f3 = d_sdivzorigin
ls f4,d_tdivzstepu #f4 = d_tdivzstepu
ls f5,d_tdivzstepv #f5 = d_tdivzstepv
ls f6,d_tdivzorigin #f6 = d_tdivzorigin
ls f7,d_zistepu #f7 = d_zistepu
ls f8,d_zistepv #f8 = d_zistepv
ls f9,d_ziorigin #f9 = d_ziorigin
lf f10,INT2DBL_0 #for int2dbl_setup
stfd f10,local(r1)
lw r6,sadjust
int2dbl f11,r6,r0,local,f10 #f11 = sadjust
lw r6,tadjust
int2dbl f12,r6,r0,local,f10 #f12 = tadjust
lw r6,bbextents
int2dbl f13,r6,r0,local,f10 #f13 = bbextents
lw r6,bbextentt
int2dbl f14,r6,r0,local,f10 #f14 = bbextentt
lw r6,d_viewbuffer #r6 = d_viewbuffer
lw r7,screenwidth #r7 = screenwidth
ls f15,c65536 #f15 = 65536
ls f16,c16 #f16 = 16
ls f25,c0 #f25 = 0
ls f28,c2 #f28 = 2
fmuls f17,f1,f16 #f17 = sdivz16stepu
fmuls f18,f4,f16 #f18 = tdivz16stepu
fmuls f19,f7,f16 #f19 = zi16stepu
subi r3,r3,4 #prepare for postincrement
###### First loop. In every iteration one complete span is drawn
# pbase = (unsigned char *)cacheblock;
#
# sdivz16stepu = d_sdivzstepu * 16;
# tdivz16stepu = d_tdivzstepu * 16;
# zi16stepu = d_zistepu * 16;
#
# do
# {
# r_turb_pdest = (unsigned char *)((byte *)d_viewbuffer +
# (screenwidth * pspan->v) + pspan->u);
#
# count = pspan->count;
#
# // calculate the initial s/z, t/z, 1/z, s, and t and clamp
# du = (float)pspan->u;
# dv = (float)pspan->v;
#
# sdivz = d_sdivzorigin + dv*d_sdivzstepv + du*d_sdivzstepu;
# tdivz = d_tdivzorigin + dv*d_tdivzstepv + du*d_tdivzstepu;
# zi = d_ziorigin + dv*d_zistepv + du*d_zistepu;
# z = (float)0x10000 / zi; // prescale to 16.16 fixed-point
#
.t8loop:
lwzu r8,4(r3) #r8 = pspan->u
int2dbl f20,r8,r0,local,f10 #f20 = du
lwzu r9,4(r3) #r9 = pspan->v
int2dbl f21,r9,r0,local,f10 #f21 = dv
fmadds f24,f21,f8,f9
fmadds f22,f21,f2,f3
fmadds f23,f21,f5,f6
fmadds f26,f20,f7,f24 #f26 = zi
fmadds f22,f20,f1,f22 #f22 = sdivz
fmadds f23,f20,f4,f23 #f23 = tdivz
fmuls f0,f26,f26
frsqrte f0,f0 #f0 ~ 1/|zi|
mullw r10,r9,r7 #r10 = pspan->v * screenwidth
fnmsubs f29,f0,f26,f28 #f29 = 2 - f0*zi
add r10,r10,r8 #r10 = r10 + pspan->u
fmuls f0,f0,f29 #first refinement step
add r11,r6,r10 #r11 = pdest
fnmsubs f29,f0,f26,f28
subi r11,r11,1 #prepare for postincrement
fmuls f0,f0,f29 #second refinement step
lwzu r12,4(r3) #r12 = count
fmuls f24,f0,f15 #f24 = z = 65536/zi
fmadds f20,f22,f24,f11 #f20 = s
fsubs f0,f20,f13
fsel f20,f0,f13,f20 #if (s>bb...) s = bbextents
fsel f20,f20,f20,f25 #if (s<0) s = 0
fctiwz f0,f20
stfd f0,local+8(r1)
lwz r8,local+12(r1) #r8 = (int)s
fmadds f21,f23,f24,f12 #f21 = t
fsubs f0,f21,f14
fsel f21,f0,f14,f21 #if (t>bb...) t = bbextentt
fsel f21,f21,f21,f25 #if (t<0) t = 0
fctiwz f0,f21
stfd f0,local+8(r1)
lwz r9,local+12(r1) #r9 = (int)t
###### Second loop. In every iteration one part of the whole span is drawn
# do
# {
# // calculate s and t at the far end of the span
# if (count >= 16)
# r_turb_spancount = 16;
# else
# r_turb_spancount = count;
#
# count -= r_turb_spancount;
#
# if (count)
# {
.t8loop2:
cmpwi r12,16
bgt .t8cont #more than 16 left: full 16 pixel part
mtctr r12 #last part: spancount = count (<= 16)
subi r10,r12,1 #r10 = spancount - 1
int2dbl f21,r10,r0,local,f10 #spancountminus1 = (float)...
li r12,0 #r12 = count -= spancount
fmadds f26,f21,f7,f26 #zi += d_zistepu * ...
b .t8finalpart
.t8cont:
fadds f26,f26,f19 #zi += zi16stepu
li r10,16 #r10 = spancount = 16
fmuls f0,f26,f26
subf r12,r10,r12 #r12 = count -= spancount
frsqrte f0,f0
mtctr r10
###### Evaluation of the values for the inner loop. This version is used for
###### span size = 16
# // calculate s/z, t/z, zi->fixed s and t at far end of span,
# // calculate s and t steps across span by shifting
# sdivz += sdivz16stepu;
# tdivz += tdivz16stepu;
# zi += zi16stepu;
# z = (float)0x10000 / zi; // prescale to 16.16 fixed-point
# snext = (int)(sdivz * z) + sadjust;
# if (snext > bbextents)
# snext = bbextents;
# else if (snext < 16)
# snext = 16; // prevent round-off error on <0 steps from
# // from causing overstepping & running off the
# // edge of the texture
# tnext = (int)(tdivz * z) + tadjust;
# if (tnext > bbextentt)
# tnext = bbextentt;
# else if (tnext < 16)
# tnext = 16; // guard against round-off error on <0 steps
# r_turb_sstep = (snext - r_turb_s) >> 4;
# r_turb_tstep = (tnext - r_turb_t) >> 4;
# }
fnmsubs f20,f0,f26,f28
fadds f22,f22,f17 #sdivz += sdivz16stepu
fmuls f0,f0,f20
fadds f23,f23,f18 #tdivz += tdivz16stepu
fnmsubs f20,f0,f26,f28
fmuls f0,f0,f20
fmuls f24,f0,f15 #f24 = z = 65536/zi
fmadds f20,f22,f24,f11 #f20 = snext
fsubs f0,f20,f13
fsel f20,f0,f13,f20 #if (snext>bb...) snext = bbextents
fsubs f0,f20,f16
fsel f20,f0,f20,f16 #if (snext<16) snext = 16
fctiwz f0,f20
stfd f0,local+8(r1)
lwz r31,local+12(r1) #r31 = (int)snext
fmadds f21,f23,f24,f12 #f21 = tnext
fsubs f0,f21,f14
fsel f21,f0,f14,f21 #if (tnext>bb...) tnext = bbextentt
subf r29,r8,r31
fsubs f0,f21,f16
srawi r29,r29,4 #r29 = sstep = (snext - s) >> 4
fsel f21,f0,f21,f16 #if (tnext<16) tnext = 16
fctiwz f0,f21
stfd f0,local+8(r1)
lwz r30,local+12(r1) #r30 = (int)tnext
subf r28,r9,r30
srawi r28,r28,4 #r28 = tstep = (tnext - t) >> 4
b .t8mainloop
###### Evaluation of the values for the inner loop. This version is used for
###### span size < 16
###### The original algorithm has two ugly divisions at the end of this part.
###### These are removed by the following optimization:
###### First, the divisors 1,2 and 4 are handled specially to gain speed. The
###### other divisors are handled using a reciprocal table.
# // calculate s/z, t/z, zi->fixed s and t at last pixel in span (so
# // can't step off polygon), clamp, calculate s and t steps across
# // span by division, biasing steps low so we don't run off the
# // texture
# spancountminus1 = (float)(r_turb_spancount - 1);
# sdivz += d_sdivzstepu * spancountminus1;
# tdivz += d_tdivzstepu * spancountminus1;
# zi += d_zistepu * spancountminus1;
# z = (float)0x10000 / zi; // prescale to 16.16 fixed-point
# snext = (int)(sdivz * z) + sadjust;
# if (snext > bbextents)
# snext = bbextents;
# else if (snext < 16)
# snext = 16; // prevent round-off error on <0 steps from
# // from causing overstepping & running off the
# // edge of the texture
#
# tnext = (int)(tdivz * z) + tadjust;
# if (tnext > bbextentt)
# tnext = bbextentt;
# else if (tnext < 16)
# tnext = 16; // guard against round-off error on <0 steps
#
# if (r_turb_spancount > 1)
# {
# r_turb_sstep = (snext - r_turb_s) / (spancount - 1);
# r_turb_tstep = (tnext - r_turb_t) / (spancount - 1);
# }
# }
.t8finalpart:
fmuls f0,f26,f26
fmadds f22,f21,f1,f22 #sdivz += d_sdivzstepu * ...
frsqrte f0,f0
fmadds f23,f21,f4,f23 #tdivz += d_tdivzstepu * ...
fnmsubs f20,f0,f26,f28
cmplwi r10,5 #CR0 is consumed by "blt .t8special" below
fmuls f0,f0,f20
fnmsubs f20,f0,f26,f28
fmuls f0,f0,f20
fmuls f24,f0,f15 #f24 = z = 65536/zi
fmadds f20,f22,f24,f11 #f20 = snext
fsubs f0,f20,f13
fsel f20,f0,f13,f20 #if (snext>bb...) snext = bbextents
fsubs f0,f20,f16
fsel f20,f0,f20,f16 #if (snext<16) snext = 16
fctiwz f0,f20
stfd f0,local+8(r1)
lwz r31,local+12(r1) #r31 = (int)snext
fmadds f21,f23,f24,f12 #f21 = tnext
fsubs f0,f21,f14
fsel f21,f0,f14,f21 #if (tnext>bb...) tnext = bbextentt
subf r29,r8,r31 #r29 = snext - s
fsubs f0,f21,f16
fsel f21,f0,f21,f16 #if (tnext<16) tnext = 16
fctiwz f0,f21
stfd f0,local+8(r1)
lwz r30,local+12(r1) #r30 = (int)tnext
subf r28,r9,r30 #r28 = tnext - t
blt .t8special #divisor (spancount-1) < 5
.t8qdiv:
slwi r0,r10,2
lwzx r0,r26,r0 #r0 = _ReciprocTable[spancount-1]
mulhw r29,r29,r0 #sstep = high word of (snext-s)*reciprocal
mulhw r28,r28,r0 #tstep = high word of (tnext-t)*reciprocal
b .t8mainloop
.t8special:
cmplwi r10,1
ble .t8mainloop #divisor 0 or 1: steps stay as they are
cmplwi r10,3
beq .t8qdiv #divisor 3: use the reciprocal table
blt .t8spec_2 #divisor 2
srawi r29,r29,2 #divisor 4
srawi r28,r28,2
b .t8mainloop
.t8spec_2:
srawi r29,r29,1
srawi r28,r28,1
###### D_DrawTurbulent8Span (inlined)
###### Main drawing loop.
# r_turb_s = r_turb_s & ((CYCLE<<16)-1);
# r_turb_t = r_turb_t & ((CYCLE<<16)-1);
#
# do
# {
# sturb = ((r_turb_s + r_turb_turb[(r_turb_t>>16)&(CYCLE-1)])>>16)&63;
# tturb = ((r_turb_t + r_turb_turb[(r_turb_s>>16)&(CYCLE-1)])>>16)&63;
# *r_turb_pdest++ = *(r_turb_pbase + (tturb<<6) + sturb);
# r_turb_s += r_turb_sstep;
# r_turb_t += r_turb_tstep;
# } while (--r_turb_spancount > 0);
.t8mainloop:
# Wrap s and t into one CYCLE but keep their 16 fraction bits: the mask is
# (CYCLE<<16)-1 = 0x007fffff, i.e. bits 9..31. Masking with only
# (CYCLE-1)<<16 would drop the fraction at the start of every part.
rlwinm r8,r8,0,9,31 #implicit: CYCLE = 128
rlwinm r9,r9,0,9,31 #implicit: CYCLE = 128
.t8draw:
rlwinm r0,r9,18,23,29 #implicit: CYCLE = 128
rlwinm r25,r8,18,23,29 #implicit: CYCLE = 128
lwzx r10,r4,r0 #r10 = r_turb_turb[(t>>16)&(CYCLE-1)]
add r10,r10,r8
lwzx r25,r4,r25 #r25 = r_turb_turb[(s>>16)&(CYCLE-1)]
add r8,r8,r29 #s += sstep
add r25,r25,r9
extrwi r10,r10,6,10 #r10 = sturb
rlwinm r25,r25,22,20,25 #r25 = tturb<<6
add r10,r10,r25
add r9,r9,r28 #t += tstep
lbzx r0,r27,r10
stbu r0,1(r11) #*pdest++ = texel
bdnz .t8draw
mr r8,r31 #s = snext
mr r9,r30 #t = tnext
mr. r12,r12
bgt .t8loop2 #while (count > 0)
lwz r3,4(r3) #while (...)
mr. r3,r3 #CR0 = (next span != 0)
subi r3,r3,4 #prepare for postincrement
bne .t8loop
lmw r25,gb(r1)
lfd f14,fb+0*8(r1)
lfd f15,fb+1*8(r1)
lfd f16,fb+2*8(r1)
lfd f17,fb+3*8(r1)
lfd f18,fb+4*8(r1)
lfd f19,fb+5*8(r1)
lfd f20,fb+6*8(r1)
lfd f21,fb+7*8(r1)
lfd f22,fb+8*8(r1)
lfd f23,fb+9*8(r1)
lfd f24,fb+10*8(r1)
lfd f25,fb+11*8(r1)
lfd f26,fb+12*8(r1)
lfd f27,fb+13*8(r1)
lfd f28,fb+14*8(r1)
lfd f29,fb+15*8(r1)
exit
funcend Turbulent8
###########################################################################
#
# void D_DrawSpans8 (espan_t *pspan)
#
# standard scan drawing function (8 pixel subdivision)
#
# In:  r3 -> first span of a linked list. Each span is read as four
#      consecutive words: u, v, count, pointer to the next span (0 ends
#      the list).
# Out: nothing. r26-r31 and f14-f29 are saved on entry and restored on exit.
#
# If d_subdiv16.value compares unequal to 0 the function branches straight
# to D_DrawSpans16 before any stack frame is set up, so that function
# returns directly to the caller.
#
# Register use:
#   r3  = span cursor (kept 4 bytes low so that lwzu can pre-increment)
#   r4  = cachewidth              r6  = d_viewbuffer
#   r7  = screenwidth             r8  = s (16.16)
#   r9  = t (16.16)               r10 = spancount-1 / scratch
#   r11 = pdest - 1               r12 = pixels left in span
#   r26 -> _ReciprocTable         r27 = pbase (cacheblock)
#   r28 = tstep   r29 = sstep     r30 = tnext   r31 = snext
#   f1-f3 = d_sdivz stepu/stepv/origin   f4-f6 = d_tdivz stepu/stepv/origin
#   f7-f9 = d_zi stepu/stepv/origin      f10 = int2dbl constant
#   f11 = sadjust  f12 = tadjust  f13 = bbextents  f14 = bbextentt
#   f15 = 65536    f16 = 8        f25 = 0          f28 = 2
#   f17-f19 = sdivz/tdivz/zi step per 8 pixels
#   f22 = sdivz    f23 = tdivz    f26 = zi         f24 = z = 65536/zi
#   f0, f20, f21, f29 = scratch
#
# 1/zi is not computed with a division: frsqrte of zi*zi yields an
# estimate of 1/|zi|, which is refined by two steps of x = x*(2 - x*zi).
#
###########################################################################
funcdef D_DrawSpans8
lxa r4,d_subdiv16 # subdiv16? then call DrawSpans16!
lfs f0,CVAR_VALUE(r4) #f0 = d_subdiv16.value
ls f1,c0
fcmpo 0,f0,f1
.ifdef WOS
bne _D_DrawSpans16
.else
bne D_DrawSpans16
.endif
init 0,16,6,16 # 16 local bytes for int2dbl/dbl2int
stmw r26,gb(r1)
stfd f14,fb+0*8(r1)
stfd f15,fb+1*8(r1)
stfd f16,fb+2*8(r1)
stfd f17,fb+3*8(r1)
stfd f18,fb+4*8(r1)
stfd f19,fb+5*8(r1)
stfd f20,fb+6*8(r1)
stfd f21,fb+7*8(r1)
stfd f22,fb+8*8(r1)
stfd f23,fb+9*8(r1)
stfd f24,fb+10*8(r1)
stfd f25,fb+11*8(r1)
stfd f26,fb+12*8(r1)
stfd f27,fb+13*8(r1)
stfd f28,fb+14*8(r1)
stfd f29,fb+15*8(r1)
lxa r26,_ReciprocTable
lw r27,cacheblock #r27 = pbase
lw r4,cachewidth #r4 = cachewidth
ls f1,d_sdivzstepu #f1 = d_sdivzstepu
ls f2,d_sdivzstepv #f2 = d_sdivzstepv
ls f3,d_sdivzorigin #f3 = d_sdivzorigin
ls f4,d_tdivzstepu #f4 = d_tdivzstepu
ls f5,d_tdivzstepv #f5 = d_tdivzstepv
ls f6,d_tdivzorigin #f6 = d_tdivzorigin
ls f7,d_zistepu #f7 = d_zistepu
ls f8,d_zistepv #f8 = d_zistepv
ls f9,d_ziorigin #f9 = d_ziorigin
lf f10,INT2DBL_0 #for int2dbl_setup
stfd f10,local(r1)
lw r6,sadjust
int2dbl f11,r6,r0,local,f10 #f11 = sadjust
lw r6,tadjust
int2dbl f12,r6,r0,local,f10 #f12 = tadjust
lw r6,bbextents
int2dbl f13,r6,r0,local,f10 #f13 = bbextents
lw r6,bbextentt
int2dbl f14,r6,r0,local,f10 #f14 = bbextentt
lw r6,d_viewbuffer #r6 = d_viewbuffer
lw r7,screenwidth #r7 = screenwidth
ls f15,c65536 #f15 = 65536
ls f16,c8 #f16 = 8
ls f25,c0 #f25 = 0
ls f28,c2 #f28 = 2
fmuls f17,f1,f16 #f17 = sdivz8stepu
fmuls f18,f4,f16 #f18 = tdivz8stepu
fmuls f19,f7,f16 #f19 = zi8stepu
subi r3,r3,4 #prepare for postincrement
###### First loop. In every iteration one complete span is drawn
# pbase = (unsigned char *)cacheblock;
#
# sdivz8stepu = d_sdivzstepu * 8;
# tdivz8stepu = d_tdivzstepu * 8;
# zi8stepu = d_zistepu * 8;
#
# do
# {
# pdest = (unsigned char *)((byte *)d_viewbuffer +
# (screenwidth * pspan->v) + pspan->u);
#
# count = pspan->count;
#
# // calculate the initial s/z, t/z, 1/z, s, and t and clamp
# du = (float)pspan->u;
# dv = (float)pspan->v;
#
# sdivz = d_sdivzorigin + dv*d_sdivzstepv + du*d_sdivzstepu;
# tdivz = d_tdivzorigin + dv*d_tdivzstepv + du*d_tdivzstepu;
# zi = d_ziorigin + dv*d_zistepv + du*d_zistepu;
# z = (float)0x10000 / zi; // prescale to 16.16 fixed-point
#
.d8loop:
lwzu r8,4(r3) #r8 = pspan->u
int2dbl f20,r8,r0,local,f10 #f20 = du
lwzu r9,4(r3) #r9 = pspan->v
int2dbl f21,r9,r0,local,f10 #f21 = dv
fmadds f24,f21,f8,f9
fmadds f22,f21,f2,f3
fmadds f23,f21,f5,f6
fmadds f26,f20,f7,f24 #f26 = zi
fmadds f22,f20,f1,f22 #f22 = sdivz
fmadds f23,f20,f4,f23 #f23 = tdivz
fmuls f0,f26,f26
frsqrte f0,f0 #f0 ~ 1/|zi|
mullw r10,r9,r7 #r10 = pspan->v * screenwidth
fnmsubs f29,f0,f26,f28 #f29 = 2 - f0*zi
add r10,r10,r8 #r10 = r10 + pspan->u
fmuls f0,f0,f29 #first refinement step
add r11,r6,r10 #r11 = pdest
fnmsubs f29,f0,f26,f28
subi r11,r11,1 #prepare for postincrement
fmuls f0,f0,f29 #second refinement step
lwzu r12,4(r3) #r12 = count
fmuls f24,f0,f15 #f24 = z = 65536/zi
fmadds f20,f22,f24,f11 #f20 = s
fsubs f0,f20,f13
fsel f20,f0,f13,f20 #if (s>bb...) s = bbextents
fsel f20,f20,f20,f25 #if (s<0) s = 0
fctiwz f0,f20
stfd f0,local+8(r1)
lwz r8,local+12(r1) #r8 = (int)s
fmadds f21,f23,f24,f12 #f21 = t
fsubs f0,f21,f14
fsel f21,f0,f14,f21 #if (t>bb...) t = bbextentt
fsel f21,f21,f21,f25 #if (t<0) t = 0
fctiwz f0,f21
stfd f0,local+8(r1)
lwz r9,local+12(r1) #r9 = (int)t
li r0,1
dcbtst r11,r0 #cache hint for the first destination byte (r11+1)
###### Second loop. In every iteration one part of the whole span is drawn
# do
# {
# // calculate s and t at the far end of the span
# if (count >= 8)
# spancount = 8;
# else
# spancount = count;
#
# count -= spancount;
#
# if (count)
# {
.d8loop2:
cmpwi r12,8
bgt .d8cont #more than 8 left: full 8 pixel part
mtctr r12 #last part: spancount = count (<= 8)
subi r10,r12,1 #r10 = spancount - 1
int2dbl f21,r10,r0,local,f10 #spancountminus1 = (float)...
li r12,0 #r12 = count -= spancount
fmadds f26,f21,f7,f26 #zi += d_zistepu * ...
b .d8finalpart
.d8cont:
fadds f26,f26,f19 #zi += zi8stepu
li r10,8 #r10 = spancount = 8
fmuls f0,f26,f26
subf r12,r10,r12 #r12 = count -= spancount
frsqrte f0,f0
mtctr r10
###### Evaluation of the values for the inner loop. This version is used for
###### span size = 8
# // calculate s/z, t/z, zi->fixed s and t at far end of span,
# // calculate s and t steps across span by shifting
# sdivz += sdivz8stepu;
# tdivz += tdivz8stepu;
# zi += zi8stepu;
# z = (float)0x10000 / zi; // prescale to 16.16 fixed-point
# snext = (int)(sdivz * z) + sadjust;
# if (snext > bbextents)
# snext = bbextents;
# else if (snext < 8)
# snext = 8; // prevent round-off error on <0 steps from
# // from causing overstepping & running off the
# // edge of the texture
# tnext = (int)(tdivz * z) + tadjust;
# if (tnext > bbextentt)
# tnext = bbextentt;
# else if (tnext < 8)
# tnext = 8; // guard against round-off error on <0 steps
# sstep = (snext - s) >> 3;
# tstep = (tnext - t) >> 3;
# }
fnmsubs f20,f0,f26,f28
fadds f22,f22,f17 #sdivz += sdivz8stepu
fmuls f0,f0,f20
fadds f23,f23,f18 #tdivz += tdivz8stepu
fnmsubs f20,f0,f26,f28
fmuls f0,f0,f20
fmuls f24,f0,f15 #f24 = z = 65536/zi
fmadds f20,f22,f24,f11 #f20 = snext
fsubs f0,f20,f13
fsel f20,f0,f13,f20 #if (snext>bb...) snext = bbextents
fsubs f0,f20,f16
fsel f20,f0,f20,f16 #if (snext<8) snext = 8
fctiwz f0,f20
stfd f0,local+8(r1)
lwz r31,local+12(r1) #r31 = (int)snext
fmadds f21,f23,f24,f12 #f21 = tnext
fsubs f0,f21,f14
fsel f21,f0,f14,f21 #if (tnext>bb...) tnext = bbextentt
subf r29,r8,r31
fsubs f0,f21,f16
srawi r29,r29,3 #r29 = sstep = (snext - s) >> 3
fsel f21,f0,f21,f16 #if (tnext<8) tnext = 8
fctiwz f0,f21
stfd f0,local+8(r1)
lwz r30,local+12(r1) #r30 = (int)tnext
subf r28,r9,r30
srawi r28,r28,3 #r28 = tstep = (tnext - t) >> 3
b .d8mainloop
###### Evaluation of the values for the inner loop. This version is used for
###### span size < 8
###### The original algorithm has two ugly divisions at the end of this part.
###### These are removed by the following optimization:
###### First, the divisors 1,2 and 4 are handled specially to gain speed. The
###### other divisors are handled using a reciprocal table.
# // calculate s/z, t/z, zi->fixed s and t at last pixel in span (so
# // can't step off polygon), clamp, calculate s and t steps across
# // span by division, biasing steps low so we don't run off the
# // texture
# spancountminus1 = (float)(spancount - 1);
# sdivz += d_sdivzstepu * spancountminus1;
# tdivz += d_tdivzstepu * spancountminus1;
# zi += d_zistepu * spancountminus1;
# z = (float)0x10000 / zi; // prescale to 16.16 fixed-point
# snext = (int)(sdivz * z) + sadjust;
# if (snext > bbextents)
# snext = bbextents;
# else if (snext < 8)
# snext = 8; // prevent round-off error on <0 steps from
# // from causing overstepping & running off the
# // edge of the texture
#
# tnext = (int)(tdivz * z) + tadjust;
# if (tnext > bbextentt)
# tnext = bbextentt;
# else if (tnext < 8)
# tnext = 8; // guard against round-off error on <0 steps
#
# if (spancount > 1)
# {
# sstep = (snext - s) / (spancount - 1);
# tstep = (tnext - t) / (spancount - 1);
# }
# }
.d8finalpart:
fmuls f0,f26,f26
fmadds f22,f21,f1,f22 #sdivz += d_sdivzstepu * ...
frsqrte f0,f0
fmadds f23,f21,f4,f23 #tdivz += d_tdivzstepu * ...
fnmsubs f20,f0,f26,f28
cmplwi r10,5 #CR0 is consumed by "blt .d8special" below
fmuls f0,f0,f20
fnmsubs f20,f0,f26,f28
fmuls f0,f0,f20
fmuls f24,f0,f15 #f24 = z = 65536/zi
fmadds f20,f22,f24,f11 #f20 = snext
fsubs f0,f20,f13
fsel f20,f0,f13,f20 #if (snext>bb...) snext = bbextents
fsubs f0,f20,f16
fsel f20,f0,f20,f16 #if (snext<8) snext = 8
fctiwz f0,f20
stfd f0,local+8(r1)
lwz r31,local+12(r1) #r31 = (int)snext
fmadds f21,f23,f24,f12 #f21 = tnext
fsubs f0,f21,f14
fsel f21,f0,f14,f21 #if (tnext>bb...) tnext = bbextentt
subf r29,r8,r31 #r29 = snext - s
fsubs f0,f21,f16
fsel f21,f0,f21,f16 #if (tnext<8) tnext = 8
fctiwz f0,f21
stfd f0,local+8(r1)
lwz r30,local+12(r1) #r30 = (int)tnext
subf r28,r9,r30 #r28 = tnext - t
blt .d8special #divisor (spancount-1) < 5
.d8qdiv:
slwi r0,r10,2
lwzx r0,r26,r0 #r0 = _ReciprocTable[spancount-1]
mulhw r29,r29,r0 #sstep = high word of (snext-s)*reciprocal
mulhw r28,r28,r0 #tstep = high word of (tnext-t)*reciprocal
b .d8mainloop
.d8special:
cmplwi r10,1
ble .d8mainloop #divisor 0 or 1: steps stay as they are
cmplwi r10,3
beq .d8qdiv #divisor 3: use the reciprocal table
blt .d8spec_2 #divisor 2
srawi r29,r29,2 #divisor 4
srawi r28,r28,2
b .d8mainloop
.d8spec_2:
srawi r29,r29,1
srawi r28,r28,1
###### Main drawing loop. Here lies the speed.
# do
# {
# *pdest++ = *(pbase + (s >> 16) + (t >> 16) * cachewidth);
# s += sstep;
# t += tstep;
# } while (--spancount > 0);
.d8mainloop:
srawi r0,r9,16 #r0 = t >> 16
srawi r10,r8,16 #r10 = s >> 16
mullw r0,r0,r4 #r0 = (t >> 16) * cachewidth
add r9,r9,r28 #t += tstep
add r0,r0,r10
add r8,r8,r29 #s += sstep
lbzx r0,r27,r0
stbu r0,1(r11) #*pdest++ = texel
bdnz .d8mainloop
mr r8,r31 #s = snext
mr r9,r30 #t = tnext
mr. r12,r12
bgt .d8loop2 #while (count > 0)
lwz r3,4(r3) #while (...)
mr. r3,r3 #CR0 = (next span != 0)
subi r3,r3,4 #prepare for postincrement
bne .d8loop
lmw r26,gb(r1)
lfd f14,fb+0*8(r1)
lfd f15,fb+1*8(r1)
lfd f16,fb+2*8(r1)
lfd f17,fb+3*8(r1)
lfd f18,fb+4*8(r1)
lfd f19,fb+5*8(r1)
lfd f20,fb+6*8(r1)
lfd f21,fb+7*8(r1)
lfd f22,fb+8*8(r1)
lfd f23,fb+9*8(r1)
lfd f24,fb+10*8(r1)
lfd f25,fb+11*8(r1)
lfd f26,fb+12*8(r1)
lfd f27,fb+13*8(r1)
lfd f28,fb+14*8(r1)
lfd f29,fb+15*8(r1)
exit
funcend D_DrawSpans8
###########################################################################
#
# void D_DrawSpans16 (espan_t *pspan)
#
# standard scan drawing function (16 pixel subdivision)
#
###########################################################################
funcdef D_DrawSpans16
init 0,16,6,16 # 16 local bytes for int2dbl/dbl2int
stmw r26,gb(r1)
stfd f14,fb+0*8(r1)
stfd f15,fb+1*8(r1)
stfd f16,fb+2*8(r1)
stfd f17,fb+3*8(r1)
stfd f18,fb+4*8(r1)
stfd f19,fb+5*8(r1)
stfd f20,fb+6*8(r1)
stfd f21,fb+7*8(r1)
stfd f22,fb+8*8(r1)
stfd f23,fb+9*8(r1)
stfd f24,fb+10*8(r1)
stfd f25,fb+11*8(r1)
stfd f26,fb+12*8(r1)
stfd f27,fb+13*8(r1)
stfd f28,fb+14*8(r1)
stfd f29,fb+15*8(r1)
lxa r26,_ReciprocTable
lw r27,cacheblock #r27 = pbase
lw r4,cachewidth #r4 = cachewidth
ls f1,d_sdivzstepu #f1 = d_sdivzstepu
ls f2,d_sdivzstepv #f2 = d_sdivzstepv
ls f3,d_sdivzorigin #f3 = d_sdivzorigin
ls f4,d_tdivzstepu #f4 = d_tdivzstepu
ls f5,d_tdivzstepv #f5 = d_tdivzstepv
ls f6,d_tdivzorigin #f6 = d_tdivzorigin
ls f7,d_zistepu #f7 = d_zistepu
ls f8,d_zistepv #f8 = d_zistepv
ls f9,d_ziorigin #f9 = d_ziorigin
lf f10,INT2DBL_0 #for int2dbl_setup
stfd f10,local(r1)
lw r6,sadjust
int2dbl f11,r6,r0,local,f10 #f11 = sadjust
lw r6,tadjust
int2dbl f12,r6,r0,local,f10 #f12 = tadjust
lw r6,bbextents
int2dbl f13,r6,r0,local,f10 #f13 = bbextents
lw r6,bbextentt
int2dbl f14,r6,r0,local,f10 #f14 = bbextentt
lw r6,d_viewbuffer #r6 = d_viewbuffer
lw r7,screenwidth #r7 = screenwidth
ls f15,c65536 #f15 = 65536
ls f16,c16 #f16 = 16
ls f25,c0 #f25 = 0
ls f28,c2 #f28 = 2
fmuls f17,f1,f16 #f17 = sdivz16stepu
fmuls f18,f4,f16 #f18 = tdivz16stepu
fmuls f19,f7,f16 #f19 = zi16stepu
subi r3,r3,4 #prepare for postincrement
###### First loop. In every iteration one complete span is drawn
# pbase = (unsigned char *)cacheblock;
#
# sdivz16stepu = d_sdivzstepu * 16;
# tdivz16stepu = d_tdivzstepu * 16;
# zi16stepu = d_zistepu * 16;
#
# do
# {
# pdest = (unsigned char *)((byte *)d_viewbuffer +
# (screenwidth * pspan->v) + pspan->u);
#
# count = pspan->count;
#
# // calculate the initial s/z, t/z, 1/z, s, and t and clamp
# du = (float)pspan->u;
# dv = (float)pspan->v;
#
# sdivz = d_sdivzorigin + dv*d_sdivzstepv + du*d_sdivzstepu;
# tdivz = d_tdivzorigin + dv*d_tdivzstepv + du*d_tdivzstepu;
# zi = d_ziorigin + dv*d_zistepv + du*d_zistepu;
# z = (float)0x10000 / zi; // prescale to 16.16 fixed-point
#
.d16loop:
lwzu r8,4(r3)
int2dbl f20,r8,r0,local,f10 #f20 = du
lwzu r9,4(r3)
int2dbl f21,r9,r0,local,f10 #f21 = dv
fmadds f24,f21,f8,f9
fmadds f22,f21,f2,f3
fmadds f23,f21,f5,f6
fmadds f26,f20,f7,f24 #f26 = zi
fmadds f22,f20,f1,f22 #f22 = sdivz
fmadds f23,f20,f4,f23 #f23 = tdivz
fmuls f0,f26,f26
frsqrte f0,f0
mullw r10,r9,r7 #r10 = pspan->v * screenwidth
fnmsubs f29,f0,f26,f28
add r10,r10,r8 #r10 = r10 + pspan->u
fmuls f0,f0,f29
add r11,r6,r10 #r11 = pdest
fnmsubs f29,f0,f26,f28
subi r11,r11,1 #prepare for postincrement
fmuls f0,f0,f29
lwzu r12,4(r3) #r12 = count
fmuls f24,f0,f15
fmadds f20,f22,f24,f11 #f20 = s
fsubs f0,f20,f13
fsel f20,f0,f13,f20 #if (s>bb...) s = bbextents
fsel f20,f20,f20,f25 #if (s<0) s = 0
fctiwz f0,f20
stfd f0,local+8(r1)
lwz r8,local+12(r1) #r8 = (int)s
fmadds f21,f23,f24,f12 #f21 = t
fsubs f0,f21,f14
fsel f21,f0,f14,f21 #if (t>bb...) t = bbextentt
fsel f21,f21,f21,f25 #if (t<0) t = 0
fctiwz f0,f21
stfd f0,local+8(r1)
lwz r9,local+12(r1) #r9 = (int)t
li r0,1
dcbtst r11,r0
###### Second loop. In every iteration one part of the whole span is drawn
# do
# {
# // calculate s and t at the far end of the span
# if (count >= 16)
# spancount = 16;
# else
# spancount = count;
#
# count -= spancount;
#
# if (count)
# {
.d16loop2:
cmpwi r12,16
bgt .d16cont
mtctr r12
subi r10,r12,1
int2dbl f21,r10,r0,local,f10 #spancountminus1 = (float)...
li r12,0 #r12 = count -= spancount
fmadds f26,f21,f7,f26 #zi += d_zistepu * ...
b .d16finalpart
.d16cont:
fadds f26,f26,f19 #zi += zi16stepu
li r10,16 #r10 = spancount = 16
fmuls f0,f26,f26
subf r12,r10,r12 #r12 = count -= spancount
frsqrte f0,f0
mtctr r10
####### Evaluation of the values for the inner loop. This version is used for
####### span size = 16
# // calculate s/z, t/z, zi->fixed s and t at far end of span,
# // calculate s and t steps across span by shifting
# sdivz += sdivz16stepu;
# tdivz += tdivz16stepu;
# zi += zi16stepu;
# z = (float)0x10000 / zi; // prescale to 16.16 fixed-point
# snext = (int)(sdivz * z) + sadjust;
# if (snext > bbextents)
# snext = bbextents;
# else if (snext < 16)
# snext = 16; // prevent round-off error on <0 steps from
# // from causing overstepping & running off the
# // edge of the texture
# tnext = (int)(tdivz * z) + tadjust;
# if (tnext > bbextentt)
# tnext = bbextentt;
# else if (tnext < 16)
# tnext = 16; // guard against round-off error on <0 steps
# sstep = (snext - s) >> 4;
# tstep = (tnext - t) >> 4;
# }
fnmsubs f20,f0,f26,f28
fadds f22,f22,f17 #sdivz += sdivz16stepu
fmuls f0,f0,f20
fadds f23,f23,f18 #tdivz += tdivz16stepu
fnmsubs f20,f0,f26,f28
fmuls f0,f0,f20
fmuls f24,f0,f15
fmadds f20,f22,f24,f11 #f20 = snext
fsubs f0,f20,f13
fsel f20,f0,f13,f20 #if (snext>bb...) snext = bbextents
fsubs f0,f20,f16
fsel f20,f0,f20,f16 #if (snext<16) snext = 16
fctiwz f0,f20
stfd f0,local+8(r1)
lwz r31,local+12(r1) #r31 = (int)snext
fmadds f21,f23,f24,f12 #f21 = tnext
fsubs f0,f21,f14
fsel f21,f0,f14,f21 #if (tnext>bb...) tnext = bbextentt
subf r29,r8,r31
fsubs f0,f21,f16
srawi r29,r29,4 #r29 = sstep = (snext - s) >> 4
fsel f21,f0,f21,f16 #if (tnext<16) tnext = 16
fctiwz f0,f21
stfd f0,local+8(r1)
lwz r30,local+12(r1) #r30 = (int)tnext
subf r28,r9,r30
srawi r28,r28,4 #r28 = tstep = (tnext - t) >> 4
b .d16mainloop
###### Evaluation of the values for the inner loop. This version is used for
###### span size < 16
###### The original algorithm has two ugly divisions at the end of this part.
###### These are removed by the following optimization:
###### First, the divisors 1,2 and 4 are handled specially to gain speed. The
###### other divisors are handled using a reciprocal table.
# // calculate s/z, t/z, zi->fixed s and t at last pixel in span (so
# // can't step off polygon), clamp, calculate s and t steps across
# // span by division, biasing steps low so we don't run off the
# // texture
# spancountminus1 = (float)(spancount - 1);
# sdivz += d_sdivzstepu * spancountminus1;
# tdivz += d_tdivzstepu * spancountminus1;
# zi += d_zistepu * spancountminus1;
# z = (float)0x10000 / zi; // prescale to 16.16 fixed-point
# snext = (int)(sdivz * z) + sadjust;
# if (snext > bbextents)
# snext = bbextents;
# else if (snext < 16)
# snext = 16; // prevent round-off error on <0 steps from
# // from causing overstepping & running off the
# // edge of the texture
#
# tnext = (int)(tdivz * z) + tadjust;
# if (tnext > bbextentt)
# tnext = bbextentt;
# else if (tnext < 16)
# tnext = 16; // guard against round-off error on <0 steps
#
# if (spancount > 1)
# {
# sstep = (snext - s) / (spancount - 1);
# tstep = (tnext - t) / (spancount - 1);
# }
# }
.d16finalpart:
fmuls f0,f26,f26
fmadds f22,f21,f1,f22 #sdivz += d_sdivzstepu * ...
frsqrte f0,f0
fmadds f23,f21,f4,f23 #tdivs += d_tdivzstepu * ...
fnmsubs f20,f0,f26,f28
cmplwi r10,5
fmuls f0,f0,f20
fnmsubs f20,f0,f26,f28
fmuls f0,f0,f20
fmuls f24,f0,f15
fmadds f20,f22,f24,f11 #f27 = snext
fsubs f0,f20,f13
fsel f20,f0,f13,f20 #if (snext>bb...) snext = bbextents
fsubs f0,f20,f16
fsel f20,f0,f20,f16 #if (snext<16) snext = 16
fctiwz f0,f20
stfd f0,local+8(r1)
lwz r31,local+12(r1) #r31 = (int)snext
fmadds f21,f23,f24,f12 #f21 = tnext
fsubs f0,f21,f14
fsel f21,f0,f14,f21 #if (tnext>bb...) tnext = bbextentt
subf r29,r8,r31
fsubs f0,f21,f16
fsel f21,f0,f21,f16 #if (tnext<16) tnext = 16
fctiwz f0,f21
stfd f0,local+8(r1)
lwz r30,local+12(r1) #r30 = (int)tnext
subf r28,r9,r30
blt .d16special
.d16qdiv:
slwi r0,r10,2
lwzx r0,r26,r0
mulhw r29,r29,r0
mulhw r28,r28,r0
b .d16mainloop
.d16special:
cmplwi r10,1
ble .d16mainloop
cmplwi r10,3
beq .d16qdiv
blt .d16spec_2
srawi r29,r29,2
srawi r28,r28,2
b .d16mainloop
.d16spec_2:
srawi r29,r29,1
srawi r28,r28,1
###### Main drawing loop. Here lies the speed.
# do
# {
# *pdest++ = *(pbase + (s >> 16) + (t >> 16) * cachewidth);
# s += sstep;
# t += tstep;
# } while (--spancount > 0);
.d16mainloop:
srawi r0,r9,16
srawi r10,r8,16
mullw r0,r0,r4
add r9,r9,r28
add r0,r0,r10
add r8,r8,r29
lbzx r0,r27,r0
stbu r0,1(r11)
bdnz .d16mainloop
mr r8,r31
mr r9,r30
mr. r12,r12
bgt .d16loop2
lwz r3,4(r3) #while (...)
mr. r3,r3
subi r3,r3,4
bne .d16loop
lmw r26,gb(r1)
lfd f14,fb+0*8(r1)
lfd f15,fb+1*8(r1)
lfd f16,fb+2*8(r1)
lfd f17,fb+3*8(r1)
lfd f18,fb+4*8(r1)
lfd f19,fb+5*8(r1)
lfd f20,fb+6*8(r1)
lfd f21,fb+7*8(r1)
lfd f22,fb+8*8(r1)
lfd f23,fb+9*8(r1)
lfd f24,fb+10*8(r1)
lfd f25,fb+11*8(r1)
lfd f26,fb+12*8(r1)
lfd f27,fb+13*8(r1)
lfd f28,fb+14*8(r1)
lfd f29,fb+15*8(r1)
exit
funcend D_DrawSpans16
###########################################################################
#
# void D_DrawZSpans (espan_t *pspan)
# r3
#
# standard z-scan drawing function
#
# Walks the linked list of spans starting at pspan and, for every span,
# fills 'count' consecutive 16-bit z-buffer entries with the upper 16 bits
# of a 32-bit fixed-point 1/z value (izi) that is stepped by izistep per
# pixel.
#
# Each span is read as four consecutive 32-bit words: u, v, count, pnext.
#
# Register usage:
#   r3  = pspan, biased so that the next lwzu 4(r3) fetches the next field
#   r5  = d_pzbuffer        r6  = d_zwidth        r7  = izistep
#   r8  = pspan->u, later izi
#   r9  = pspan->v          r10 = count
#   r11 = scratch (byte offset into the z-buffer)
#   r12 = pdest - 4 (biased for the stwu in the pair loop)
#   r0  = scratch
#   f1  = INT2DBL_0 (constant handed to the int2dbl macro)
#   f2  = cHUGE (0x8000*0x10000)
#   f3  = d_zistepu, f4 = d_zistepv, f5 = d_ziorigin
#   f0, f6, f7, f8, f10 = scratch
#
# Stack locals: local+0..7 holds INT2DBL_0 for the int2dbl macro,
# local+8..15 is the scratch slot used to move fctiwz results into a GPR.
#
# NOTE: PowerPC is big-endian, so when two z values are written with one
# 32-bit store the FIRST pixel has to sit in the UPPER halfword. The C
# reference code shown in the comments below packs them the other way
# round (little-endian layout); the pair loop deliberately differs here.
#
###########################################################################
funcdef D_DrawZSpans
init 0,16,0,0 # 16 local bytes for int2dbl/dbl2int
lf f1,INT2DBL_0 #for int2dbl_setup
stfd f1,local(r1) #keep the constant in local+0..7 for int2dbl
ls f2,cHUGE #f2 = 0x8000*0x10000
ls f3,d_zistepu #f3 = d_zistepu
ls f4,d_zistepv #f4 = d_zistepv
ls f5,d_ziorigin #f5 = d_ziorigin
lw r5,d_pzbuffer #r5 = d_pzbuffer
lw r6,d_zwidth #r6 = d_zwidth
# izistep = (int)(d_zistepu * 0x8000 * 0x10000);
fmuls f6,f3,f2
fctiwz f0,f6 #convert to integer, rounding toward zero
stfd f0,local+8(r1) #integer result is in the low word of f0
lwz r7,local+12(r1) #r7 = izistep
subi r3,r3,4 #prepare for postincrement
# pdest = d_pzbuffer + (d_zwidth * pspan->v) + pspan->u;
#
# count = pspan->count;
#
# // calculate the initial 1/z
# du = (float)pspan->u;
# dv = (float)pspan->v;
#
# zi = d_ziorigin + dv*d_zistepv + du*d_zistepu;
# // we count on FP exceptions being turned off to avoid range problems
# izi = (int)(zi * 0x8000 * 0x10000);
.zloop:
lwzu r8,4(r3) #r8 = pspan->u
lwzu r9,4(r3) #r9 = pspan->v
lwzu r10,4(r3) #r10 = pspan->count
mullw r11,r9,r6 #r11 = d_zwidth * v
add r11,r11,r8 #r11 = d_zwidth * v + u (index in shorts)
add r11,r11,r11 #r11 = byte offset (2 bytes per entry)
add r12,r5,r11 #r12 = pdest = d_pzbuffer + ...
int2dbl f7,r8,r0,local,f1 #f7 = du
int2dbl f8,r9,r0,local,f1 #f8 = dv
fmadds f10,f8,f4,f5 #f10 = d_ziorigin + dv*d_zistepv
subi r12,r12,4 #r12 = pdest - 4 (bias for stwu below)
fmadds f10,f7,f3,f10 #zi = d_ziorigin + ...
fmuls f7,f10,f2 #f7 = zi * 0x8000 * 0x10000
fctiwz f0,f7
stfd f0,local+8(r1)
lwz r8,local+12(r1) #r8 = izi = (int)(zi * ...)
# if ((long)pdest & 0x02)
# {
# *pdest++ = (short)(izi >> 16);
# izi += izistep;
# count--;
# }
# (pdest - 4) has the same bit 1 as pdest, so testing r12 is equivalent.
andi. r0,r12,2 #if (long)pdest & 0x02
beq .zcont
srawi r0,r8,16
sth r0,4(r12) #*pdest++ (short)(izi >> 16)
addi r12,r12,2
add r8,r8,r7 #izi += izistep
subi r10,r10,1 #count--
.zcont:
# if ((doublecount = count >> 1) > 0)
# {
# do
# {
# ltemp = izi >> 16;
# izi += izistep;
# ltemp |= izi & 0xFFFF0000;
# izi += izistep;
# *(int *)pdest = ltemp;
# pdest += 2;
# } while (--doublecount > 0);
# }
#
# Big-endian packing: the word store puts the upper halfword at the lower
# address, so the first pixel's z goes into the upper 16 bits and the
# second pixel's z into the lower 16 bits.
srawi. r0,r10,1 #if ((doublecount = count >> 1))
ble .zcont2
mtctr r0 #ctr = doublecount
.zloop2:
mr r0,r8 #upper half of ltemp = z of first pixel
add r8,r8,r7 #izi += izistep
rlwimi r0,r8,16,16,31 #lower half of ltemp = izi >> 16 (second pixel)
add r8,r8,r7 #izi += izistep
stwu r0,4(r12) #*(int *)pdest = ltemp; pdest += 2
bdnz .zloop2
.zcont2:
# if (count & 1)
# *pdest = (short)(izi >> 16);
andi. r0,r10,1 #if (count & 1)
beq .zcont3
srawi r0,r8,16 #*pdest = (short)(izi >> 16)
sth r0,4(r12)
.zcont3:
# } while ((pspan = pspan->pnext) != NULL);
# r3 points at pspan->count here, so pnext is the word at 4(r3).
lwz r3,4(r3) #while (...)
mr. r3,r3 #set CR0 from the unbiased pointer
subi r3,r3,4 #bias for the lwzu sequence (CR0 is not changed)
bne .zloop
exit
funcend D_DrawZSpans
# Floating-point constants referenced by the routines in this file.
# WOS selects the PowerOpen ABI build (see the note at the top of the file);
# in that case the constants go into the section opened by .tocd
# (presumably TOC data - confirm against macrosPPC.i / the assembler),
# otherwise into the ordinary .data section.
.ifdef WOS
.tocd
.else
.data
.endif
lab c8
.float 8.0
lab c16
.float 16.0
# 2147483648.0 = 2^31 = 0x8000 * 0x10000; D_DrawZSpans multiplies 1/z by
# this to get the 32-bit fixed-point izi / izistep values.
lab cHUGE
.float 2147483648.0
lab cSPEED
.float 20
lab cAMP2times2
.float 6.0 # 2 * AMP2
# The value above is hard-coded, so stop the assembly if AMP2 is ever
# changed from 3 without updating cAMP2times2.
.ifne AMP2-3
.fail "AMP2 must be 3!"
.endif