Enigma Amiga Life 113

home *** CD-ROM | disk | FTP | other *** search

/ Enigma Amiga Life 113 / EnigmaAmiga113CD.iso / software / sviluppo / quake_src / d_scan68k.s < prev next >

Wrap

Text File | 2000-06-17 | 67KB | 2,097 lines

* * Copyright (C) 1996-1997 Id Software, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * * See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * ** ** Quake for AMIGA ** d_scan.c assembler implementations by Frank Wille <frank@phoenix.owl.de> ** XREF _cacheblock XREF _d_sdivzorigin XREF _d_sdivzstepu XREF _d_sdivzstepv XREF _d_tdivzorigin XREF _d_tdivzstepu XREF _d_tdivzstepv XREF _d_ziorigin XREF _d_zistepu XREF _d_zistepv XREF _sadjust XREF _tadjust XREF _sdivz XREF _tdivz XREF _bbextents XREF _bbextentt XREF _d_viewbuffer XREF _screenwidth XREF _cachewidth XREF _d_zwidth XREF _d_pzbuffer XREF _sintable XREF _cl XREF _intsintable XREF _vid XREF _scr_vrect XREF _r_refdef XREF _d_subdiv16 XDEF _D_WarpScreen XDEF _Turbulent8 XDEF _D_DrawSpans8 XDEF _D_DrawZSpans QDIV = 1 NICE_DIV = 1 PSPAN_NEXT = $C CL_TIME = $23C VID_BUFFER = 0 VID_ROWBYTES = 16 SCR_VRECT_X = 0 SCR_VRECT_Y = 4 SCR_VRECT_WIDTH = 8 SCR_VRECT_HEIGHT = 12 REFDEF_VRECT_X = 0 REFDEF_VRECT_Y = 4 REFDEF_VRECT_WIDTH = 8 REFDEF_VRECT_HEIGHT = 12 CVAR_VALUE = 16 CYCLE = 128 ;MUST match the #define in d_iface.h! AMP2 = 3 ;-- SPEED = 20 ;-- ****************************************************************************** * * void _D_WarpScreen (void) * * water effect algorithm * ****************************************************************************** cnop 0,4 _D_WarpScreen rsreset .rowptr rs.l 1024 .column rs.l 1280 .stackframe rs.l 0 movem.l d2-d7/a2-a6,-(sp) fmovem.x fp2/fp3,-(sp) sub.l #.stackframe,sp move.l sp,a2 lea .column(sp),a1 lea _vid,a3 lea _r_refdef,a4 lea _scr_vrect,a5 move.l _screenwidth,d4 move.l SCR_VRECT_WIDTH(a5),d6 move.l SCR_VRECT_HEIGHT(a5),d7 move.l REFDEF_VRECT_X(a4),d2 move.l REFDEF_VRECT_Y(a4),d3 fmove.l REFDEF_VRECT_WIDTH(a4),fp0 fmove.l REFDEF_VRECT_HEIGHT(a4),fp1 * w = r_refdef.vrect.width; * h = r_refdef.vrect.height; * * wratio = w / (float)scr_vrect.width; * hratio = h / (float)scr_vrect.height; fmove.s #AMP2*2,fp2 fmove fp2,fp3 fadd fp0,fp2 ;fp2 = w + AMP2*2 fadd fp1,fp3 ;fp3 = h + AMP2*2 fmul.l d6,fp2 ;* (float)scr_vrect.width fmul.l d7,fp3 ;* (float)scr_vrect.height fmul fp0,fp0 ;w*w fmul fp1,fp1 ;h*h fdiv fp2,fp0 ;fp0=wratio*w/(w+AMP2*2) fdiv fp3,fp1 ;fp1=hratio*h/(h+AMP2*2) move.l VID_ROWBYTES(a3),d5 mulu d4,d3 ;d3=r_refdef.vrect.y*screenwidth add.l _d_viewbuffer,d3 ;d3=d_viewbuffer+d3 add.l #AMP2*2,d6 add.l #AMP2*2,d7 * for (v=0 ; v<scr_vrect.height+AMP2*2 ; v++) * { * rowptr[v] = d_viewbuffer + (r_refdef.vrect.y * screenwidth) + * (screenwidth * (int)((float)v * hratio * h / (h + AMP2 * 2))); * } moveq #0,d0 ;v = 0 move.l a2,a6 ;a6 -> rowptr[0] .loop fmove.l d0,fp3 ;fp3 = (float)v fmul fp1,fp3 ;(float)v*hratio*h/(h+AMP2*2) fmove.l fp3,d1 ;d1 = (int)fp3 muls d4,d1 ;d1 = d1 * screenwidth add.l d3,d1 ;d1 = d_viewbuffer+(r_...*scr...)+d1 addq.l #1,d0 ;v++ move.l d1,(a6)+ ;rowptr[v] = d1 cmp.l d7,d0 blt.b .loop * for (u=0 ; u<scr_vrect.width+AMP2*2 ; u++) * { * column[u] = r_refdef.vrect.x + * (int)((float)u * wratio * w / (w + AMP2 * 2)); * } moveq #0,d0 ;u = 0 move.l a1,a6 ;a6 -> column[0] .loop2 fmove.l d0,fp2 ;fp2 = (float)u fmul fp0,fp2 ;(float)u * wratio*w/(w+AMP2*2) fmove.l fp2,d1 ;d1 = (int)fp2 add.l d2,d1 ;d1 = r_refdef.vrect.x + d1 addq.l #1,d0 ;u++ move.l d1,(a6)+ ;column[u] = d1 cmp.l d6,d0 blt.b .loop2 ****** d5 = vid.rowbytes ****** a1 -> column ****** a2 -> rowptr * turb = intsintable + ((int)(cl.time*SPEED)&(CYCLE-1)); * dest = vid.buffer + scr_vrect.y * vid.rowbytes + scr_vrect.x; * for (v=0 ; v<scr_vrect.height ; v++, dest += vid.rowbytes) * { * col = &column[turb[v]]; * row = &rowptr[v]; * for (u=0 ; u<scr_vrect.width ; u+=4) * { * dest[u+0] = row[turb[u+0]][col[u+0]]; * dest[u+1] = row[turb[u+1]][col[u+1]]; * dest[u+2] = row[turb[u+2]][col[u+2]]; * dest[u+3] = row[turb[u+3]][col[u+3]]; * } * } move.l SCR_VRECT_WIDTH(a5),d6 lsr #2,d6 subq #1,d6 move.l SCR_VRECT_HEIGHT(a5),d7 fmove.d _cl+CL_TIME,fp0 ;get cl.time fmul.s #SPEED,fp0 ;fp0 = cl.time*SPEED fmove.l fp0,d4 ;(int)(cl.time*SPEED) and.l #CYCLE-1,d4 ;(int)(cl.time*SPEED)&(CYCLE-1) lsl.l #2,d4 add.l #_intsintable,d4 ;turb = _intsintable + 4*d0 move.l SCR_VRECT_Y(a5),d3 mulu d5,d3 ;vid.rowbytes * scr_vrect.y add.l SCR_VRECT_X(a5),d3 ;d3 + scr_vrect.x add.l VID_BUFFER(a3),d3 ;dest = vid.buffer + d3 moveq #0,d1 .loop3 move d6,d0 move.l d4,a6 ;a6 -> turb[u] move.l 0(a6,d1.l*4),d2 ;d2 = turb[v] move.l d3,a0 ;a0 -> dest[u] lea 0(a1,d2.l*4),a4 ;col = &column[turb[v]] lea 0(a2,d1.l*4),a5 ;row = &rowptr[v] .loop4 move.l (a6)+,d2 ;d2 = turb[u+0] move.l 0(a5,d2.l*4),a3 ;a3 = row[turb[u+0]] move.l (a4)+,d2 ;d2 = col[u+0] move.b 0(a3,d2.l),(a0)+ ;dest[u+0]=row[turb[u+0][col[u+0]] move.l (a6)+,d2 ;same for u=1,2,3 move.l 0(a5,d2.l*4),a3 move.l (a4)+,d2 move.b 0(a3,d2.l),(a0)+ move.l (a6)+,d2 move.l 0(a5,d2.l*4),a3 move.l (a4)+,d2 move.b 0(a3,d2.l),(a0)+ move.l (a6)+,d2 move.l 0(a5,d2.l*4),a3 move.l (a4)+,d2 move.b 0(a3,d2.l),(a0)+ dbra d0,.loop4 add.l d5,d3 addq #1,d1 cmp d7,d1 blt.b .loop3 add.l #.stackframe,sp fmovem.x (sp)+,fp2/fp3 movem.l (sp)+,d2-d7/a2-a6 rts ****************************************************************************** * * void Turbulent8 (espan_t *pspan) * * standard scan drawing function for animated textures * Note: The function D_DrawTurbulent8Span was inlined into this * function, because it's never used anywhere else. * ****************************************************************************** cnop 0,4 _Turbulent8 ***** stackframe rsreset .saved4 rs.l 1 .saved5 rs.l 1 .savea1 rs.l 1 .szstpu rs.s 1 .szstpv rs.s 1 .szorg rs.s 1 .tzstpu rs.s 1 .tzstpv rs.s 1 .tzorg rs.s 1 .zistpu rs.s 1 .zistpv rs.s 1 .ziorg rs.s 1 .fpuregs rs.x 6 .intregs rs.l 11 rs.l 1 .pspan rs.l 1 ****** Prologue. Global variables are put into registers or onto the stackframe movem.l d2-d7/a2-a6,-(sp) fmovem.x fp2-fp7,-(sp) move.l _bbextentt,a2 move.l _tadjust,a3 move.l _bbextents,a4 move.l _sadjust,a5 move.l _d_ziorigin,-(sp) move.l _d_zistepv,-(sp) move.l _d_zistepu,-(sp) move.l _d_tdivzorigin,-(sp) move.l _d_tdivzstepv,-(sp) move.l _d_tdivzstepu,-(sp) move.l _d_sdivzorigin,-(sp) move.l _d_sdivzstepv,-(sp) move.l _d_sdivzstepu,-(sp) sub.l #.szstpu,sp ****** First loop. In every iteration one complete span is drawn * r_turb_turb = sintable + ((int)(cl.time*SPEED)&(CYCLE-1)); * * r_turb_pbase = (unsigned char *)cacheblock; * * sdivz16stepu = d_sdivzstepu * 16; * tdivz16stepu = d_tdivzstepu * 16; * zi16stepu = d_zistepu * 16; * * do * { * r_turb_pdest = (unsigned char *)((byte *)d_viewbuffer + * (screenwidth * pspan->v) + pspan->u); * * count = pspan->count; * * // calculate the initial s/z, t/z, 1/z, s, and t and clamp * du = (float)pspan->u; * dv = (float)pspan->v; * * sdivz = d_sdivzorigin + dv*d_sdivzstepv + du*d_sdivzstepu; * tdivz = d_tdivzorigin + dv*d_tdivzstepv + du*d_tdivzstepu; * zi = d_ziorigin + dv*d_zistepv + du*d_zistepu; * z = (float)0x10000 / zi; // prescale to 16.16 fixed-point * fmove.d _cl+CL_TIME,fp0 ;get cl.time fmul.s #SPEED,fp0 ;fp0 = cl.time*SPEED fmove.l fp0,d0 ;(int)(cl.time*SPEED) and.l #CYCLE-1,d0 ;(int)(cl.time*SPEED)&(CYCLE-1) lsl.l #2,d0 add.l #_sintable,d0 ;r_turb_turb = _sintable + 4*d0 move.l d0,a6 fmove.s #16,fp7 fmove.s .szstpu(sp),fp3 fmul fp7,fp3 ;sdivz16stepu = d_sdivzstepu * 16 fmove.s .tzstpu(sp),fp4 fmul fp7,fp4 ;tdivz16stepu = d_tdivzstepu * 16 fmove.s .zistpu(sp),fp5 fmul fp7,fp5 ;zi16stepu = d_zistepu * 16 move.l .pspan(sp),a1 ;get function parameter .loop move.l a1,.savea1(sp) ;save actual ptr to pspan move.l _d_viewbuffer,a0 move.l _screenwidth,d0 move.l (a1)+,d1 fmove.l d1,fp2 ;du = (float)pspan->u move.l (a1)+,d2 fmove.l d2,fp7 ;dv = (float)pspan->v move.l (a1)+,d4 muls d2,d0 ;d0 = screenwidth * pspan->v add.l d1,d0 add.l d0,a0 ;pdest = d_viewbuffer + pspan->u + d0 lea .szstpu(sp),a1 ;a1 -> stackframe fmove.s (a1)+,fp0 fmul fp2,fp0 ;fp0 = du * d_sdivzstepu fmove.s (a1)+,fp1 fmul fp7,fp1 ;fp1 = dv * d_sdivzstepv fadd fp1,fp0 fadd.s (a1)+,fp0 ;sdivz = d_sdivzorigin + fp0 + fp1 fmove.s (a1)+,fp1 fmul fp2,fp1 ;fp1 = du * d_tdivzstepu fmove.s (a1)+,fp6 fmul fp7,fp6 ;fp6 = dv * d_tdivzstepv fadd fp6,fp1 fadd.s (a1)+,fp1 ;tdivz = d_tdivzorigin + fp1 + fp6 fmul.s (a1)+,fp2 ;fp2 = du * d_zistepu fmul.s (a1)+,fp7 ;fp7 = dv * d_zistepv fadd fp7,fp2 fadd.s (a1)+,fp2 ;zi = d_ziorigin + fp2 + fp7 fmove.s #65536,fp6 fdiv fp2,fp6 ;z = (float)0x10000 / zi * s = (int)(sdivz * z) + sadjust; * if (s > bbextents) * s = bbextents; * else if (s < 0) * s = 0; * * t = (int)(tdivz * z) + tadjust; * if (t > bbextentt) * t = bbextentt; * else if (t < 0) * t = 0; fmove fp6,fp7 fmul fp0,fp7 ;fp7 = sdivz * z fmove.l fp7,d6 ;convert to integer add.l a5,d6 ;s = d6 + sadjust cmp.l a4,d6 ;if (s > bbextents) bgt.b .down tst.l d6 ;if (s < 0) bge.b .keep .up moveq #0,d6 ;s = 0 bra.b .keep .down move.l a4,d6 ;s = bbextents .keep fmul fp1,fp6 ;fp6 = tdivz * z fmove.l fp6,d7 ;convert to integer add.l a3,d7 ;t = d7 + tadjust cmp.l a2,d7 ;if (t > bbextentt) bgt.b .down2 tst.l d7 ;if (t < 0) bge.b .keep2 .up2 moveq #0,d7 ;t = 0 bra.b .keep2 .down2 move.l a2,d7 ;t = bbextentt .keep2 move.l d4,d1 ****** Second loop. In every iteration one part of the whole span is drawn ****** d2 gets the value (spancount-1)! [NOT spancount] ****** d1 = count * do * { * // calculate s and t at the far end of the span * if (count >= 16) * spancount = 16; * else * spancount = count; * * count -= spancount; * * if (count) * { .loop2 moveq #16-1,d2 ;spancount = 16 cmp.l #16,d1 ;if (count >= 16) bgt.b .cont move.l d1,d2 ;spancount = count subq.l #1,d2 moveq #0,d1 ;count -= spancount bra.w .finalpart .cont sub.l #16,d1 ;count -= spancount; ****** Evaluation of the values for the inner loop. This version is used for ****** span size = 16 ****** a2 : bbextentt ****** a3 : tadjust ****** a4 : bbextents ****** a5 : sadjust ****** fp0 : sdivz ****** fp1 : tdivz ****** fp2 : zi ****** fp3 : sdivz16stepu ****** fp4 : tdivz16stepu ****** fp5 : zi16stepu * // calculate s/z, t/z, zi->fixed s and t at far end of span, * // calculate s and t steps across span by shifting * sdivz += sdivz16stepu; * tdivz += tdivz16stepu; * zi += zi16stepu; * z = (float)0x10000 / zi; // prescale to 16.16 fixed-point * * snext = (int)(sdivz * z) + sadjust; * if (snext > bbextents) * snext = bbextents; * else if (snext < 16) * snext = 16; // prevent round-off error on <0 steps from * // from causing overstepping & running off the * // edge of the texture * * tnext = (int)(tdivz * z) + tadjust; * if (tnext > bbextentt) * tnext = bbextentt; * else if (tnext < 16) * tnext = 16; // guard against round-off error on <0 steps * * r_turb_sstep = (snext - r_turb_s) >> 4; * r_turb_tstep = (tnext - r_turb_t) >> 4; * } fadd fp3,fp0 ;sdivz += sdivz16stepu fadd fp4,fp1 ;tdivz += tdivz16stepu fadd fp5,fp2 ;zi += zi16stepu fmove.s #65536,fp7 fdiv fp2,fp7 ;z = (float)0x10000 / zi; fmove fp7,fp6 fmul fp0,fp6 ;fp2 = sdivz * z fmove.l fp6,d4 ;convert to integer add.l a5,d4 ;snext = d4 + sadjust cmp.l a4,d4 ;if (snext > bbextents) bgt.b .down3 cmp.l #16,d4 ;if (snext < 16) bge.b .keep3 .up3 moveq #16,d4 ;snext = 16 bra.b .keep3 .down3 move.l a4,d4 ;snext = bbextents .keep3 fmul fp1,fp7 ;fp7 = tdivz * z fmove.l fp7,d5 ;convert to integer add.l a3,d5 ;tnext = d5 + tadjust cmp.l a2,d5 ;if (tnext > bbextentt) bgt.b .down4 cmp.l #16,d5 ;if (tnext < 16) bge.b .keep4 .up4 moveq #16,d5 ;tnext = 16 bra.b .keep4 .down4 move.l a2,d5 ;tnext = bbextentt .keep4 move.l d4,.saved4(sp) ;save snext move.l d5,.saved5(sp) ;save tnext sub.l d6,d4 ;d4 = snext - s sub.l d7,d5 ;d5 = tnext - t asr.l #4,d4 ;r_turb_sstep = d4 >> 4 asr.l #4,d5 ;r_turb_tstep = d5 >> 4 bra.w .mainloop ****** Evaluation of the values for the inner loop. This version is used for ****** span size < 16 ****** The original algorithm has two ugly divisions at the end of this part. ****** These are removed by the following optimization: ****** First, the divisors 1,2 and 4 are handled specially to gain speed. The ****** other divisors are handled using a reciprocal table. ****** a2 : bbextentt ****** a3 : tadjust ****** a4 : bbextents ****** a5 : sadjust ****** fp0 : sdivz ****** fp1 : tdivz ****** fp2 : zi * // calculate s/z, t/z, zi->fixed s and t at last pixel in span (so * // can't step off polygon), clamp, calculate s and t steps across * // span by division, biasing steps low so we don't run off the * // texture * spancountminus1 = (float)(r_turb_spancount - 1); * sdivz += d_sdivzstepu * spancountminus1; * tdivz += d_tdivzstepu * spancountminus1; * zi += d_zistepu * spancountminus1; * z = (float)0x10000 / zi; // prescale to 16.16 fixed-point * snext = (int)(sdivz * z) + sadjust; * if (snext > bbextents) * snext = bbextents; * else if (snext < 16) * snext = 16; // prevent round-off error on <0 steps from * // from causing overstepping & running off the * // edge of the texture * * tnext = (int)(tdivz * z) + tadjust; * if (tnext > bbextentt) * tnext = bbextentt; * else if (tnext < 16) * tnext = 16; // guard against round-off error on <0 steps * * if (r_turb_spancount > 1) * { * r_turb_sstep = (snext - r_turb_s) / (r_turb_spancount - 1); * r_turb_tstep = (tnext - r_turb_t) / (r_turb_spancount - 1); * } * } .finalpart fmove.l d2,fp7 ;spancountminus1 = (float)(r_turb_spancount-1) fmove fp7,fp6 fmul.s .szstpu(sp),fp6 ;fp6 = d_sdivzstepu * spancountminus1 fadd fp6,fp0 ;sdivz += fp6 fmove fp7,fp6 fmul.s .tzstpu(sp),fp6 ;fp6 = d_tdivzstepu * spancountminus1 fadd fp6,fp1 ;tdivz += fp6 fmul.s .zistpu(sp),fp7 ;fp7 = d_zistepu * spancountminus1 fadd fp7,fp2 ;zi += fp7 fmove.s #65536,fp7 fdiv fp2,fp7 ;z = (float)0x10000 / zi; fmove fp7,fp6 fmul fp0,fp6 ;fp6 = sdivz * z fmove.l fp6,d4 ;convert to integer add.l a5,d4 ;snext = d4 + sadjust cmp.l a4,d4 ;if (snext > bbextents) bgt.b .down5 cmp.l #16,d4 ;if (snext < 16) bge.b .keep5 .up5 moveq #16,d4 ;snext = 16 bra.b .keep5 .down5 move.l a4,d4 ;snext = bbextents .keep5 fmul fp1,fp7 ;fp7 = tdivz * z fmove.l fp7,d5 ;convert to integer add.l a3,d5 ;tnext = d5 + tadjust cmp.l a2,d5 ;if (tnext > bbextentt) bgt.b .down6 cmp.l #16,d5 ;if (tnext < 16) bge.b .keep6 .up6 moveq #16,d5 ;tnext = 16 bra.b .keep6 .down6 move.l a2,d5 ;tnext = bbextentt .keep6 move.l d4,.saved4(sp) ;save snext move.l d5,.saved5(sp) ;save tnext sub.l d6,d4 ;d4 = snext - r_turb_s sub.l d7,d5 ;d5 = tnext - r_turb_t IFEQ QDIV tst.l d2 beq.w .mainloop divs.l d2,d4 divs.l d2,d5 ELSEIF cmp #5,d2 ;(r_turb_spancount-1) < 5? blt.b .special ;yes -> special case cmp #8,d2 beq.b .spec_8 .qdiv IFNE NICE_DIV lsl.l #2,d4 lsl.l #2,d5 lea ReciprocTable,a1 move 0(a1,d2.w*2),d0 move.l d4,d3 mulu d0,d3 clr d3 swap d3 swap d4 muls d0,d4 add.l d3,d4 move.l d5,d3 mulu d0,d3 clr d3 swap d3 swap d5 muls d0,d5 add.l d3,d5 bra.b .mainloop ELSEIF asr.l #7,d4 ;d4 >> 7 asr.l #7,d5 ;d5 >> 7 lea ReciprocTable,a1 ;a1 -> reciprocal table move 0(a1,d2.w*2),d0 ;d0 = (1/(r_turb_spancount-1))<<16 muls d0,d4 ;d4 = d4 / (r_turb_spancount-1) asr.l #7,d4 ;sstep = d4 >> 7 muls d0,d5 ;d5 = d5 / (r_turb_spancount-1) asr.l #7,d5 ;tstep = d5 >> 7 bra.b .mainloop ENDC .special cmp #1,d2 ;switch (r_turb_spancount-1) ble.b .mainloop ;0,1 -> no scaling needed cmp #3,d2 ;3 -> standard qdiv beq.b .qdiv blt.b .spec_2 asr.l #2,d4 ;4 -> scale by shifting right asr.l #2,d5 bra.b .mainloop .spec_8 asr.l #3,d4 ;8 -> scale by shifting right asr.l #3,d5 bra.b .mainloop .spec_2 asr.l #1,d4 ;2 -> scale by shifting right asr.l #1,d5 ENDC ****** D_DrawTurbulent8Span (inlined) ****** Main drawing loop. ****** d2 : r_turb_spancount ****** d4 : r_turb_sstep ****** d5 : r_turb_tstep ****** d6 : r_turb_s ****** d7 : r_turb_t ****** a0 : r_turb_pdest ****** a6 : r_turb_turb * do * { * sturb = ((r_turb_s + r_turb_turb[(r_turb_t>>16)&(CYCLE-1)])>>16)&63; * tturb = ((r_turb_t + r_turb_turb[(r_turb_s>>16)&(CYCLE-1)])>>16)&63; * *r_turb_pdest++ = *(r_turb_pbase + (tturb<<6) + sturb); * r_turb_s += r_turb_sstep; * r_turb_t += r_turb_tstep; * } while (--r_turb_spancount > 0); .mainloop move.l d1,-(sp) move.l _cacheblock,a1 ;pbase = (unsigned char *)cacheblock moveq #10,d1 .draw swap d6 ;r_turb_s >> 16 swap d7 ;r_turb_t >> 16 and #CYCLE-1,d6 ;(r_turb_s >> 16) & (CYCLE-1) and #CYCLE-1,d7 ;(r_turb_t >> 16) & (CYCLE-1) move.l 0(a6,d7.w*4),d0 ;r_turb_turb [d7] move.l 0(a6,d6.w*4),d3 ;r_turb_turb [d6] swap d6 swap d7 add.l d6,d0 ;r_turb_s + r_turb_turb [] add.l d7,d3 ;r_turb_t + r_turb_turb [] swap d0 ;d0 >> 16 and.l #$3f,d0 ;sturb = (d0 >> 16) & 63 lsr.l d1,d3 ;(d3 >> (16-6)) and.l #$fc0,d3 ;tturb<<6 = (d3 >> (16-6)) & (63 << 6) add.l d3,d0 ;sturb + tturb << 6 move.b 0(a1,d0.l),(a0)+ ;*r_turb_pdest++ = *(r_turb_pbase + d0) add.l d4,d6 ;r_turb_s += r_turb_sstep add.l d5,d7 ;r_turb_t += r_turb_tstep dbra d2,.draw ;while (--r_turb_spancount > 0) move.l (sp)+,d1 ****** loop terminations move.l .saved5(sp),d7 ;r_turb_t = tnext move.l .saved4(sp),d6 ;r_turb_s = snext tst.l d1 ;while (count > 0) bgt.w .loop2 move.l .savea1(sp),a1 ;while ((pspan = pspan->next) != NULL) move.l PSPAN_NEXT(a1),a1 tst.l a1 bne.w .loop add.l #.fpuregs,sp fmovem.x (sp)+,fp2-fp7 movem.l (sp)+,d2-d7/a2-a6 rts ****************************************************************************** * * void D_DrawSpans8 (espan_t *pspan) * * standard scan drawing function (8 pixel subdivision) * ****************************************************************************** cnop 0,4 _D_DrawSpans8 ***** stackframe rsreset .saved4 rs.l 1 .saved5 rs.l 1 .savea6 rs.l 1 .szstpu rs.s 1 .szstpv rs.s 1 .szorg rs.s 1 .tzstpu rs.s 1 .tzstpv rs.s 1 .tzorg rs.s 1 .zistpu rs.s 1 .zistpv rs.s 1 .ziorg rs.s 1 .fpuregs rs.x 6 .intregs rs.l 11 rs.l 1 .pspan rs.l 1 ****** Prologue. Global variables are put into registers or onto the stackframe fmove.s _d_subdiv16+CVAR_VALUE,fp0 fcmp.s #0,fp0 fbne _D_DrawSpans16 movem.l d2-d7/a2-a6,-(sp) fmovem.x fp2-fp7,-(sp) move.l _bbextentt,a2 move.l _tadjust,a3 move.l _bbextents,a4 move.l _sadjust,a5 move.l _d_ziorigin,-(sp) move.l _d_zistepv,-(sp) move.l _d_zistepu,-(sp) move.l _d_tdivzorigin,-(sp) move.l _d_tdivzstepv,-(sp) move.l _d_tdivzstepu,-(sp) move.l _d_sdivzorigin,-(sp) move.l _d_sdivzstepv,-(sp) move.l _d_sdivzstepu,-(sp) sub.l #.szstpu,sp ****** First loop. In every iteration one complete span is drawn * pbase = (unsigned char *)cacheblock; * * sdivz8stepu = d_sdivzstepu * 8; * tdivz8stepu = d_tdivzstepu * 8; * zi8stepu = d_zistepu * 8; * * do * { * pdest = (unsigned char *)((byte *)d_viewbuffer + * (screenwidth * pspan->v) + pspan->u); * * count = pspan->count; * * // calculate the initial s/z, t/z, 1/z, s, and t and clamp * du = (float)pspan->u; * dv = (float)pspan->v; * * sdivz = d_sdivzorigin + dv*d_sdivzstepv + du*d_sdivzstepu; * tdivz = d_tdivzorigin + dv*d_tdivzstepv + du*d_tdivzstepu; * zi = d_ziorigin + dv*d_zistepv + du*d_zistepu; * z = (float)0x10000 / zi; // prescale to 16.16 fixed-point * move.l _cacheblock,a1 ;pbase = (unsigned char *)cacheblock fmove.s #8,fp7 fmove.s .szstpu(sp),fp3 fmul fp7,fp3 ;sdivz8stepu = d_sdivzstepu * 8 fmove.s .tzstpu(sp),fp4 fmul fp7,fp4 ;tdivz8stepu = d_tdivzstepu * 8 fmove.s .zistpu(sp),fp5 fmul fp7,fp5 ;zi8stepu = d_zistepu * 8 move.l .pspan(sp),a6 ;get function parameter .loop move.l a6,.savea6(sp) ;save actual ptr to pspan move.l _d_viewbuffer,a0 move.l _screenwidth,d0 move.l (a6)+,d1 fmove.l d1,fp2 ;du = (float)pspan->u move.l (a6)+,d2 fmove.l d2,fp7 ;dv = (float)pspan->v move.l (a6)+,d4 muls d2,d0 ;d0 = screenwidth * pspan->v add.l d1,d0 add.l d0,a0 ;pdest = d_viewbuffer + pspan->u + d0 lea .szstpu(sp),a6 ;a6 -> stackframe fmove.s (a6)+,fp0 fmul fp2,fp0 ;fp0 = du * d_sdivzstepu fmove.s (a6)+,fp1 fmul fp7,fp1 ;fp1 = dv * d_sdivzstepv fadd fp1,fp0 fadd.s (a6)+,fp0 ;sdivz = d_sdivzorigin + fp0 + fp1 fmove.s (a6)+,fp1 fmul fp2,fp1 ;fp1 = du * d_tdivzstepu fmove.s (a6)+,fp6 fmul fp7,fp6 ;fp6 = dv * d_tdivzstepv fadd fp6,fp1 fadd.s (a6)+,fp1 ;tdivz = d_tdivzorigin + fp1 + fp6 fmul.s (a6)+,fp2 ;fp2 = du * d_zistepu fmul.s (a6)+,fp7 ;fp7 = dv * d_zistepv fadd fp7,fp2 fadd.s (a6)+,fp2 ;zi = d_ziorigin + fp2 + fp7 fmove.s #65536,fp6 fdiv fp2,fp6 ;z = (float)0x10000 / zi * s = (int)(sdivz * z) + sadjust; * if (s > bbextents) * s = bbextents; * else if (s < 0) * s = 0; * * t = (int)(tdivz * z) + tadjust; * if (t > bbextentt) * t = bbextentt; * else if (t < 0) * t = 0; fmove fp6,fp7 fmul fp0,fp7 ;fp7 = sdivz * z fmove.l fp7,d6 ;convert to integer add.l a5,d6 ;s = d6 + sadjust cmp.l a4,d6 ;if (s > bbextents) bgt.b .down tst.l d6 ;if (s < 0) bge.b .keep .up moveq #0,d6 ;s = 0 bra.b .keep .down move.l a4,d6 ;s = bbextents .keep fmul fp1,fp6 ;fp6 = tdivz * z fmove.l fp6,d7 ;convert to integer add.l a3,d7 ;t = d7 + tadjust cmp.l a2,d7 ;if (t > bbextentt) bgt.b .down2 tst.l d7 ;if (t < 0) bge.b .keep2 .up2 moveq #0,d7 ;t = 0 bra.b .keep2 .down2 move.l a2,d7 ;t = bbextentt .keep2 move.l d4,d1 ****** Second loop. In every iteration one part of the whole span is drawn ****** d2 gets the value (spancount-1)! [NOT spancount] ****** d1 = count * do * { * // calculate s and t at the far end of the span * if (count >= 8) * spancount = 8; * else * spancount = count; * * count -= spancount; * * if (count) * { .loop2 moveq #8-1,d2 ;spancount = 8 cmp.l #8,d1 ;if (count >= 8) bgt.b .cont move.l d1,d2 ;spancount = count subq.l #1,d2 moveq #0,d1 ;count -= spancount bra.w .finalpart .cont subq.l #8,d1 ;count -= spancount; ****** Evaluation of the values for the inner loop. This version is used for ****** span size = 8 ****** a2 : bbextentt ****** a3 : tadjust ****** a4 : bbextents ****** a5 : sadjust ****** fp0 : sdivz ****** fp1 : tdivz ****** fp2 : zi ****** fp3 : sdivz8stepu ****** fp4 : tdivz8stepu ****** fp5 : zi8stepu * // calculate s/z, t/z, zi->fixed s and t at far end of span, * // calculate s and t steps across span by shifting * sdivz += sdivz8stepu; * tdivz += tdivz8stepu; * zi += zi8stepu; * z = (float)0x10000 / zi; // prescale to 16.16 fixed-point * snext = (int)(sdivz * z) + sadjust; * if (snext > bbextents) * snext = bbextents; * else if (snext < 8) * snext = 8; // prevent round-off error on <0 steps from * // from causing overstepping & running off the * // edge of the texture * tnext = (int)(tdivz * z) + tadjust; * if (tnext > bbextentt) * tnext = bbextentt; * else if (tnext < 8) * tnext = 8; // guard against round-off error on <0 steps * sstep = (snext - s) >> 3; * tstep = (tnext - t) >> 3; * } fadd fp3,fp0 ;sdivz += sdivz8stepu fadd fp4,fp1 ;tdivz += tdivz8stepu fadd fp5,fp2 ;zi += zi8stepu fmove.s #65536,fp7 fdiv fp2,fp7 ;z = (float)0x10000 / zi; fmove fp7,fp6 fmul fp0,fp6 ;fp2 = sdivz * z fmove.l fp6,d4 ;convert to integer add.l a5,d4 ;snext = d4 + sadjust cmp.l a4,d4 ;if (snext > bbextents) bgt.b .down3 cmp.l #8,d4 ;if (snext < 8) bge.b .keep3 .up3 moveq #8,d4 ;snext = 8 bra.b .keep3 .down3 move.l a4,d4 ;snext = bbextents .keep3 fmul fp1,fp7 ;fp7 = tdivz * z fmove.l fp7,d5 ;convert to integer add.l a3,d5 ;tnext = d5 + tadjust cmp.l a2,d5 ;if (tnext > bbextentt) bgt.b .down4 cmp.l #8,d5 ;if (tnext < 8) bge.b .keep4 .up4 moveq #8,d5 ;tnext = 8 bra.b .keep4 .down4 move.l a2,d5 ;tnext = bbextentt .keep4 move.l d4,.saved4(sp) ;save snext move.l d5,.saved5(sp) ;save tnext sub.l d6,d4 ;d4 = snext - s sub.l d7,d5 ;d5 = tnext - t asr.l #3,d4 ;sstep = d4 >> 3 asr.l #3,d5 ;tstep = d5 >> 3 bra.w .mainloop ****** Evaluation of the values for the inner loop. This version is used for ****** span size < 8 ****** The original algorithm has two ugly divisions at the end of this part. ****** These are removed by the following optimization: ****** First, the divisors 1,2 and 4 are handled specially to gain speed. The ****** other divisors are handled using a reciprocal table. ****** a2 : bbextentt ****** a3 : tadjust ****** a4 : bbextents ****** a5 : sadjust ****** fp0 : sdivz ****** fp1 : tdivz ****** fp2 : zi * // calculate s/z, t/z, zi->fixed s and t at last pixel in span (so * // can't step off polygon), clamp, calculate s and t steps across * // span by division, biasing steps low so we don't run off the * // texture * spancountminus1 = (float)(spancount - 1); * sdivz += d_sdivzstepu * spancountminus1; * tdivz += d_tdivzstepu * spancountminus1; * zi += d_zistepu * spancountminus1; * z = (float)0x10000 / zi; // prescale to 16.16 fixed-point * snext = (int)(sdivz * z) + sadjust; * if (snext > bbextents) * snext = bbextents; * else if (snext < 8) * snext = 8; // prevent round-off error on <0 steps from * // from causing overstepping & running off the * // edge of the texture * * tnext = (int)(tdivz * z) + tadjust; * if (tnext > bbextentt) * tnext = bbextentt; * else if (tnext < 8) * tnext = 8; // guard against round-off error on <0 steps * * if (spancount > 1) * { * sstep = (snext - s) / (spancount - 1); * tstep = (tnext - t) / (spancount - 1); * } * } .finalpart fmove.l d2,fp7 ;spancountminus1 = (float)(spancount-1) fmove fp7,fp6 fmul.s .szstpu(sp),fp6 ;fp6 = d_sdivzstepu * spancountminus1 fadd fp6,fp0 ;sdivz += fp6 fmove fp7,fp6 fmul.s .tzstpu(sp),fp6 ;fp6 = d_tdivzstepu * spancountminus1 fadd fp6,fp1 ;tdivz += fp6 fmul.s .zistpu(sp),fp7 ;fp7 = d_zistepu * spancountminus1 fadd fp7,fp2 ;zi += fp7 fmove.s #65536,fp7 fdiv fp2,fp7 ;z = (float)0x10000 / zi; fmove fp7,fp6 fmul fp0,fp6 ;fp6 = sdivz * z fmove.l fp6,d4 ;convert to integer add.l a5,d4 ;snext = d4 + sadjust cmp.l a4,d4 ;if (snext > bbextents) bgt.b .down5 cmp.l #8,d4 ;if (snext < 8) bge.b .keep5 .up5 moveq #8,d4 ;snext = 8 bra.b .keep5 .down5 move.l a4,d4 ;snext = bbextents .keep5 fmul fp1,fp7 ;fp7 = tdivz * z fmove.l fp7,d5 ;convert to integer add.l a3,d5 ;tnext = d5 + tadjust cmp.l a2,d5 ;if (tnext > bbextentt) bgt.b .down6 cmp.l #8,d5 ;if (tnext < 8) bge.b .keep6 .up6 moveq #8,d5 ;tnext = 8 bra.b .keep6 .down6 move.l a2,d5 ;tnext = bbextentt .keep6 move.l d4,.saved4(sp) ;save snext move.l d5,.saved5(sp) ;save tnext sub.l d6,d4 ;d4 = snext - s sub.l d7,d5 ;d5 = tnext - t IFEQ QDIV tst.l d2 beq.w .mainloop divs.l d2,d4 divs.l d2,d5 ELSEIF cmp #5,d2 ;(spancount-1) < 5? blt.b .special ;yes -> special case .qdiv IFNE NICE_DIV lsl.l #2,d4 lsl.l #2,d5 lea ReciprocTable,a6 move 0(a6,d2.w*2),d0 move.l d4,d3 mulu d0,d3 clr d3 swap d3 swap d4 muls d0,d4 add.l d3,d4 move.l d5,d3 mulu d0,d3 clr d3 swap d3 swap d5 muls d0,d5 add.l d3,d5 bra.b .mainloop ELSEIF asr.l #7,d4 ;d4 >> 7 asr.l #7,d5 ;d5 >> 7 lea ReciprocTable,a6 ;a6 -> reciprocal table move 0(a6,d2.w*2),d0 ;d0 = (1/(spancount-1))<<16 muls d0,d4 ;d4 = d4 / (spancount-1) asr.l #7,d4 ;sstep = d4 >> 7 muls d0,d5 ;d5 = d5 / (spancount-1) asr.l #7,d5 ;tstep = d5 >> 7 bra.b .mainloop ENDC .special cmp #1,d2 ;switch (spancount-1) ble.b .mainloop ;0,1 -> no scaling needed cmp #3,d2 ;3 -> standard qdiv beq.b .qdiv blt.b .spec_2 asr.l #2,d4 ;4 -> scale by shifting right asr.l #2,d5 bra.b .mainloop .spec_2 asr.l #1,d4 ;2 -> scale by shifting right asr.l #1,d5 ENDC ****** Main drawing loop. Here lies the speed. ****** Very optimized (removed multiplication from inner loop) ****** d2 : spancount ****** d4 : sstep ****** d5 : tstep ****** d6 : s ****** d7 : t ****** a0 : pdest ****** a1 : pbase * do * { * *pdest++ = *(pbase + (s >> 16) + (t >> 16) * cachewidth); * s += sstep; * t += tstep; * } while (--spancount > 0); .mainloop move.l d1,-(sp) lea .PixTable,a6 ;a6 -> Functable move.l _cachewidth,d3 ;read cachewidth move.l 0(a6,d2.w*4),a6 ;get pointer to function swap d7 swap d4 move.l d7,d1 swap d5 muls d3,d7 ;d7 = t integer part * cachewidth move d5,d2 clr d1 ;d1 = t fractional part muls d3,d2 ;tstep integer part * cachewidth move d4,d0 ;d0 = sstep integer part clr d5 ;d5 = tstep fractional part clr d4 ;d4 = sstep fractional part swap d6 ;d6 = s swapped jmp (a6) .Pix8 lea 0(a1,d6.w),a6 move.b 0(a6,d7.l),(a0)+ add.l d4,d6 ;increment s fractional part addx.w d0,d6 ;increment s integer part add.l d2,d7 ;increment t integer part add.l d5,d1 ;increment t fractional part bcc.b .Pix7 ;check if carry add.l d3,d7 ;add cachewidth to t .Pix7 lea 0(a1,d6.w),a6 ;and so long... move.b 0(a6,d7.l),(a0)+ add.l d4,d6 addx.w d0,d6 add.l d2,d7 add.l d5,d1 bcc.b .Pix6 add.l d3,d7 .Pix6 lea 0(a1,d6.w),a6 move.b 0(a6,d7.l),(a0)+ add.l d4,d6 addx.w d0,d6 add.l d2,d7 add.l d5,d1 bcc.b .Pix5 add.l d3,d7 .Pix5 lea 0(a1,d6.w),a6 move.b 0(a6,d7.l),(a0)+ add.l d4,d6 addx.w d0,d6 add.l d2,d7 add.l d5,d1 bcc.b .Pix4 add.l d3,d7 .Pix4 lea 0(a1,d6.w),a6 move.b 0(a6,d7.l),(a0)+ add.l d4,d6 addx.w d0,d6 add.l d2,d7 add.l d5,d1 bcc.b .Pix3 add.l d3,d7 .Pix3 lea 0(a1,d6.w),a6 move.b 0(a6,d7.l),(a0)+ add.l d4,d6 addx.w d0,d6 add.l d2,d7 add.l d5,d1 bcc.b .Pix2 add.l d3,d7 .Pix2 lea 0(a1,d6.w),a6 move.b 0(a6,d7.l),(a0)+ add.l d4,d6 addx.w d0,d6 add.l d2,d7 add.l d5,d1 bcc.b .Pix1 add.l d3,d7 .Pix1 lea 0(a1,d6.w),a6 move.b 0(a6,d7.l),(a0)+ add.l d4,d6 addx.w d0,d6 add.l d2,d7 add.l d5,d1 bcc.b .Pix0 add.l d3,d7 .Pix0 move.l (sp)+,d1 ****** loop terminations move.l .saved5(sp),d7 ;t = tnext move.l .saved4(sp),d6 ;s = snext tst.l d1 ;while (count > 0) bgt.w .loop2 move.l .savea6(sp),a6 ;while ((pspan = pspan->next) != NULL) move.l PSPAN_NEXT(a6),a6 tst.l a6 bne.w .loop add.l #.fpuregs,sp fmovem.x (sp)+,fp2-fp7 movem.l (sp)+,d2-d7/a2-a6 rts .PixTable dc.l .Pix1 dc.l .Pix2 dc.l .Pix3 dc.l .Pix4 dc.l .Pix5 dc.l .Pix6 dc.l .Pix7 dc.l .Pix8 ****************************************************************************** * * void D_DrawSpans16 (espan_t *pspan) * * standard scan drawing function (16 pixel subdivision) * ****************************************************************************** cnop 0,4 _D_DrawSpans16 ***** stackframe rsreset .saved4 rs.l 1 .saved5 rs.l 1 .savea6 rs.l 1 .szstpu rs.s 1 .szstpv rs.s 1 .szorg rs.s 1 .tzstpu rs.s 1 .tzstpv rs.s 1 .tzorg rs.s 1 .zistpu rs.s 1 .zistpv rs.s 1 .ziorg rs.s 1 .fpuregs rs.x 6 .intregs rs.l 11 rs.l 1 .pspan rs.l 1 ****** Prologue. Global variables are put into registers or onto the stackframe movem.l d2-d7/a2-a6,-(sp) fmovem.x fp2-fp7,-(sp) move.l _bbextentt,a2 move.l _tadjust,a3 move.l _bbextents,a4 move.l _sadjust,a5 move.l _d_ziorigin,-(sp) move.l _d_zistepv,-(sp) move.l _d_zistepu,-(sp) move.l _d_tdivzorigin,-(sp) move.l _d_tdivzstepv,-(sp) move.l _d_tdivzstepu,-(sp) move.l _d_sdivzorigin,-(sp) move.l _d_sdivzstepv,-(sp) move.l _d_sdivzstepu,-(sp) sub.l #.szstpu,sp ****** First loop. In every iteration one complete span is drawn * pbase = (unsigned char *)cacheblock; * * sdivz16stepu = d_sdivzstepu * 16; * tdivz16stepu = d_tdivzstepu * 16; * zi16stepu = d_zistepu * 16; * * do * { * pdest = (unsigned char *)((byte *)d_viewbuffer + * (screenwidth * pspan->v) + pspan->u); * * count = pspan->count; * * // calculate the initial s/z, t/z, 1/z, s, and t and clamp * du = (float)pspan->u; * dv = (float)pspan->v; * * sdivz = d_sdivzorigin + dv*d_sdivzstepv + du*d_sdivzstepu; * tdivz = d_tdivzorigin + dv*d_tdivzstepv + du*d_tdivzstepu; * zi = d_ziorigin + dv*d_zistepv + du*d_zistepu; * z = (float)0x10000 / zi; // prescale to 16.16 fixed-point * move.l _cacheblock,a1 ;pbase = (unsigned char *)cacheblock fmove.s #16,fp7 fmove.s .szstpu(sp),fp3 fmul fp7,fp3 ;sdivz16stepu = d_sdivzstepu * 16 fmove.s .tzstpu(sp),fp4 fmul fp7,fp4 ;tdivz16stepu = d_tdivzstepu * 16 fmove.s .zistpu(sp),fp5 fmul fp7,fp5 ;zi16stepu = d_zistepu * 16 move.l .pspan(sp),a6 ;get function parameter .loop move.l a6,.savea6(sp) ;save actual ptr to pspan move.l _d_viewbuffer,a0 move.l _screenwidth,d0 move.l (a6)+,d1 fmove.l d1,fp2 ;du = (float)pspan->u move.l (a6)+,d2 fmove.l d2,fp7 ;dv = (float)pspan->v move.l (a6)+,d4 muls d2,d0 ;d0 = screenwidth * pspan->v add.l d1,d0 add.l d0,a0 ;pdest = d_viewbuffer + pspan->u + d0 lea .szstpu(sp),a6 ;a6 -> stackframe fmove.s (a6)+,fp0 fmul fp2,fp0 ;fp0 = du * d_sdivzstepu fmove.s (a6)+,fp1 fmul fp7,fp1 ;fp1 = dv * d_sdivzstepv fadd fp1,fp0 fadd.s (a6)+,fp0 ;sdivz = d_sdivzorigin + fp0 + fp1 fmove.s (a6)+,fp1 fmul fp2,fp1 ;fp1 = du * d_tdivzstepu fmove.s (a6)+,fp6 fmul fp7,fp6 ;fp6 = dv * d_tdivzstepv fadd fp6,fp1 fadd.s (a6)+,fp1 ;tdivz = d_tdivzorigin + fp1 + fp6 fmul.s (a6)+,fp2 ;fp2 = du * d_zistepu fmul.s (a6)+,fp7 ;fp7 = dv * d_zistepv fadd fp7,fp2 fadd.s (a6)+,fp2 ;zi = d_ziorigin + fp2 + fp7 fmove.s #65536,fp6 fdiv fp2,fp6 ;z = (float)0x10000 / zi * s = (int)(sdivz * z) + sadjust; * if (s > bbextents) * s = bbextents; * else if (s < 0) * s = 0; * * t = (int)(tdivz * z) + tadjust; * if (t > bbextentt) * t = bbextentt; * else if (t < 0) * t = 0; fmove fp6,fp7 fmul fp0,fp7 ;fp7 = sdivz * z fmove.l fp7,d6 ;convert to integer add.l a5,d6 ;s = d6 + sadjust cmp.l a4,d6 ;if (s > bbextents) bgt.b .down tst.l d6 ;if (s < 0) bge.b .keep .up moveq #0,d6 ;s = 0 bra.b .keep .down move.l a4,d6 ;s = bbextents .keep fmul fp1,fp6 ;fp6 = tdivz * z fmove.l fp6,d7 ;convert to integer add.l a3,d7 ;t = d7 + tadjust cmp.l a2,d7 ;if (t > bbextentt) bgt.b .down2 tst.l d7 ;if (t < 0) bge.b .keep2 .up2 moveq #0,d7 ;t = 0 bra.b .keep2 .down2 move.l a2,d7 ;t = bbextentt .keep2 move.l d4,d1 ****** Second loop. In every iteration one part of the whole span is drawn ****** d2 gets the value (spancount-1)! [NOT spancount] ****** d1 = count * do * { * // calculate s and t at the far end of the span * if (count >= 16) * spancount = 16; * else * spancount = count; * * count -= spancount; * * if (count) * { .loop2 moveq #16-1,d2 ;spancount = 16 cmp.l #16,d1 ;if (count >= 16) bgt.b .cont move.l d1,d2 ;spancount = count subq.l #1,d2 moveq #0,d1 ;count -= spancount bra.w .finalpart .cont sub.l #16,d1 ;count -= spancount; ****** Evaluation of the values for the inner loop. This version is used for ****** span size = 16 ****** a2 : bbextentt ****** a3 : tadjust ****** a4 : bbextents ****** a5 : sadjust ****** fp0 : sdivz ****** fp1 : tdivz ****** fp2 : zi ****** fp3 : sdivz16stepu ****** fp4 : tdivz16stepu ****** fp5 : zi16stepu * // calculate s/z, t/z, zi->fixed s and t at far end of span, * // calculate s and t steps across span by shifting * sdivz += sdivz16stepu; * tdivz += tdivz16stepu; * zi += zi16stepu; * z = (float)0x10000 / zi; // prescale to 16.16 fixed-point * snext = (int)(sdivz * z) + sadjust; * if (snext > bbextents) * snext = bbextents; * else if (snext < 16) * snext = 16; // prevent round-off error on <0 steps from * // from causing overstepping & running off the * // edge of the texture * tnext = (int)(tdivz * z) + tadjust; * if (tnext > bbextentt) * tnext = bbextentt; * else if (tnext < 16) * tnext = 16; // guard against round-off error on <0 steps * sstep = (snext - s) >> 4; * tstep = (tnext - t) >> 4; * } fadd fp3,fp0 ;sdivz += sdivz16stepu fadd fp4,fp1 ;tdivz += tdivz16stepu fadd fp5,fp2 ;zi += zi16stepu fmove.s #65536,fp7 fdiv fp2,fp7 ;z = (float)0x10000 / zi; fmove fp7,fp6 fmul fp0,fp6 ;fp2 = sdivz * z fmove.l fp6,d4 ;convert to integer add.l a5,d4 ;snext = d4 + sadjust cmp.l a4,d4 ;if (snext > bbextents) bgt.b .down3 cmp.l #16,d4 ;if (snext < 16) bge.b .keep3 .up3 moveq #16,d4 ;snext = 16 bra.b .keep3 .down3 move.l a4,d4 ;snext = bbextents .keep3 fmul fp1,fp7 ;fp7 = tdivz * z fmove.l fp7,d5 ;convert to integer add.l a3,d5 ;tnext = d5 + tadjust cmp.l a2,d5 ;if (tnext > bbextentt) bgt.b .down4 cmp.l #16,d5 ;if (tnext < 16) bge.b .keep4 .up4 moveq #16,d5 ;tnext = 16 bra.b .keep4 .down4 move.l a2,d5 ;tnext = bbextentt .keep4 move.l d4,.saved4(sp) ;save snext move.l d5,.saved5(sp) ;save tnext sub.l d6,d4 ;d4 = snext - s sub.l d7,d5 ;d5 = tnext - t asr.l #4,d4 ;sstep = d4 >> 4 asr.l #4,d5 ;tstep = d5 >> 4 bra.w .mainloop ****** Evaluation of the values for the inner loop. This version is used for ****** span size < 16 ****** The original algorithm has two ugly divisions at the end of this part. ****** These are removed by the following optimization: ****** First, the divisors 1,2 and 4 are handled specially to gain speed. The ****** other divisors are handled using a reciprocal table. ****** a2 : bbextentt ****** a3 : tadjust ****** a4 : bbextents ****** a5 : sadjust ****** fp0 : sdivz ****** fp1 : tdivz ****** fp2 : zi * // calculate s/z, t/z, zi->fixed s and t at last pixel in span (so * // can't step off polygon), clamp, calculate s and t steps across * // span by division, biasing steps low so we don't run off the * // texture * spancountminus1 = (float)(spancount - 1); * sdivz += d_sdivzstepu * spancountminus1; * tdivz += d_tdivzstepu * spancountminus1; * zi += d_zistepu * spancountminus1; * z = (float)0x10000 / zi; // prescale to 16.16 fixed-point * snext = (int)(sdivz * z) + sadjust; * if (snext > bbextents) * snext = bbextents; * else if (snext < 16) * snext = 16; // prevent round-off error on <0 steps from * // from causing overstepping & running off the * // edge of the texture * * tnext = (int)(tdivz * z) + tadjust; * if (tnext > bbextentt) * tnext = bbextentt; * else if (tnext < 16) * tnext = 16; // guard against round-off error on <0 steps * * if (spancount > 1) * { * sstep = (snext - s) / (spancount - 1); * tstep = (tnext - t) / (spancount - 1); * } * } .finalpart fmove.l d2,fp7 ;spancountminus1 = (float)(spancount-1) fmove fp7,fp6 fmul.s .szstpu(sp),fp6 ;fp6 = d_sdivzstepu * spancountminus1 fadd fp6,fp0 ;sdivz += fp6 fmove fp7,fp6 fmul.s .tzstpu(sp),fp6 ;fp6 = d_tdivzstepu * spancountminus1 fadd fp6,fp1 ;tdivz += fp6 fmul.s .zistpu(sp),fp7 ;fp7 = d_zistepu * spancountminus1 fadd fp7,fp2 ;zi += fp7 fmove.s #65536,fp7 fdiv fp2,fp7 ;z = (float)0x10000 / zi; fmove fp7,fp6 fmul fp0,fp6 ;fp6 = sdivz * z fmove.l fp6,d4 ;convert to integer add.l a5,d4 ;snext = d4 + sadjust cmp.l a4,d4 ;if (snext > bbextents) bgt.b .down5 cmp.l #16,d4 ;if (snext < 16) bge.b .keep5 .up5 moveq #16,d4 ;snext = 16 bra.b .keep5 .down5 move.l a4,d4 ;snext = bbextents .keep5 fmul fp1,fp7 ;fp7 = tdivz * z fmove.l fp7,d5 ;convert to integer add.l a3,d5 ;tnext = d5 + tadjust cmp.l a2,d5 ;if (tnext > bbextentt) bgt.b .down6 cmp.l #16,d5 ;if (tnext < 16) bge.b .keep6 .up6 moveq #16,d5 ;tnext = 16 bra.b .keep6 .down6 move.l a2,d5 ;tnext = bbextentt .keep6 move.l d4,.saved4(sp) ;save snext move.l d5,.saved5(sp) ;save tnext sub.l d6,d4 ;d4 = snext - s sub.l d7,d5 ;d5 = tnext - t IFEQ QDIV tst.l d2 beq.w .mainloop divs.l d2,d4 divs.l d2,d5 ELSEIF cmp #5,d2 ;(spancount-1) < 5? blt.b .special ;yes -> special case cmp #8,d2 beq.b .spec_8 .qdiv IFNE NICE_DIV lsl.l #2,d4 lsl.l #2,d5 lea ReciprocTable,a6 move 0(a6,d2.w*2),d0 move.l d4,d3 mulu d0,d3 clr d3 swap d3 swap d4 muls d0,d4 add.l d3,d4 move.l d5,d3 mulu d0,d3 clr d3 swap d3 swap d5 muls d0,d5 add.l d3,d5 bra.b .mainloop ELSEIF asr.l #7,d4 ;d4 >> 7 asr.l #7,d5 ;d5 >> 7 lea ReciprocTable,a6 ;a6 -> reciprocal table move 0(a6,d2.w*2),d0 ;d0 = (1/(spancount-1))<<16 muls d0,d4 ;d4 = d4 / (spancount-1) asr.l #7,d4 ;sstep = d4 >> 7 muls d0,d5 ;d5 = d5 / (spancount-1) asr.l #7,d5 ;tstep = d5 >> 7 bra.b .mainloop ENDC .special cmp #1,d2 ;switch (spancount-1) ble.b .mainloop ;0,1 -> no scaling needed cmp #3,d2 ;3 -> standard qdiv beq.b .qdiv blt.b .spec_2 asr.l #2,d4 ;4 -> scale by shifting right asr.l #2,d5 bra.b .mainloop .spec_8 asr.l #3,d4 ;8 -> scale by shifting right asr.l #3,d5 bra.b .mainloop .spec_2 asr.l #1,d4 ;2 -> scale by shifting right asr.l #1,d5 ENDC ****** Main drawing loop. Here lies the speed. ****** Very optimized (removed multiplication from inner loop) ****** d2 : spancount ****** d4 : sstep ****** d5 : tstep ****** d6 : s ****** d7 : t ****** a0 : pdest ****** a1 : pbase * do * { * *pdest++ = *(pbase + (s >> 16) + (t >> 16) * cachewidth); * s += sstep; * t += tstep; * } while (--spancount > 0); .mainloop move.l d1,-(sp) lea .PixTable,a6 ;a6 -> Functable move.l _cachewidth,d3 ;read cachewidth move.l 0(a6,d2.w*4),a6 ;get pointer to function swap d7 swap d4 move.l d7,d1 swap d5 muls d3,d7 ;d7 = t integer part * cachewidth move d5,d2 clr d1 ;d1 = t fractional part muls d3,d2 ;tstep integer part * cachewidth move d4,d0 ;d0 = sstep integer part clr d5 ;d5 = tstep fractional part clr d4 ;d4 = sstep fractional part swap d6 ;d6 = s swapped jmp (a6) .Pix16 lea 0(a1,d6.w),a6 move.b 0(a6,d7.l),(a0)+ add.l d4,d6 ;increment s fractional part addx.w d0,d6 ;increment s integer part add.l d2,d7 ;increment t integer part add.l d5,d1 ;increment t fractional part bcc.b .Pix15 ;check if carry add.l d3,d7 ;add cachewidth to t .Pix15 lea 0(a1,d6.w),a6 ;and so long... move.b 0(a6,d7.l),(a0)+ add.l d4,d6 addx.w d0,d6 add.l d2,d7 add.l d5,d1 bcc.b .Pix14 add.l d3,d7 .Pix14 lea 0(a1,d6.w),a6 move.b 0(a6,d7.l),(a0)+ add.l d4,d6 addx.w d0,d6 add.l d2,d7 add.l d5,d1 bcc.b .Pix13 add.l d3,d7 .Pix13 lea 0(a1,d6.w),a6 move.b 0(a6,d7.l),(a0)+ add.l d4,d6 addx.w d0,d6 add.l d2,d7 add.l d5,d1 bcc.b .Pix12 add.l d3,d7 .Pix12 lea 0(a1,d6.w),a6 move.b 0(a6,d7.l),(a0)+ add.l d4,d6 addx.w d0,d6 add.l d2,d7 add.l d5,d1 bcc.b .Pix11 add.l d3,d7 .Pix11 lea 0(a1,d6.w),a6 move.b 0(a6,d7.l),(a0)+ add.l d4,d6 addx.w d0,d6 add.l d2,d7 add.l d5,d1 bcc.b .Pix10 add.l d3,d7 .Pix10 lea 0(a1,d6.w),a6 move.b 0(a6,d7.l),(a0)+ add.l d4,d6 addx.w d0,d6 add.l d2,d7 add.l d5,d1 bcc.b .Pix9 add.l d3,d7 .Pix9 lea 0(a1,d6.w),a6 move.b 0(a6,d7.l),(a0)+ add.l d4,d6 addx.w d0,d6 add.l d2,d7 add.l d5,d1 bcc.b .Pix8 add.l d3,d7 .Pix8 lea 0(a1,d6.w),a6 move.b 0(a6,d7.l),(a0)+ add.l d4,d6 addx.w d0,d6 add.l d2,d7 add.l d5,d1 bcc.b .Pix7 add.l d3,d7 .Pix7 lea 0(a1,d6.w),a6 move.b 0(a6,d7.l),(a0)+ add.l d4,d6 addx.w d0,d6 add.l d2,d7 add.l d5,d1 bcc.b .Pix6 add.l d3,d7 .Pix6 lea 0(a1,d6.w),a6 move.b 0(a6,d7.l),(a0)+ add.l d4,d6 addx.w d0,d6 add.l d2,d7 add.l d5,d1 bcc.b .Pix5 add.l d3,d7 .Pix5 lea 0(a1,d6.w),a6 move.b 0(a6,d7.l),(a0)+ add.l d4,d6 addx.w d0,d6 add.l d2,d7 add.l d5,d1 bcc.b .Pix4 add.l d3,d7 .Pix4 lea 0(a1,d6.w),a6 move.b 0(a6,d7.l),(a0)+ add.l d4,d6 addx.w d0,d6 add.l d2,d7 add.l d5,d1 bcc.b .Pix3 add.l d3,d7 .Pix3 lea 0(a1,d6.w),a6 move.b 0(a6,d7.l),(a0)+ add.l d4,d6 addx.w d0,d6 add.l d2,d7 add.l d5,d1 bcc.b .Pix2 add.l d3,d7 .Pix2 lea 0(a1,d6.w),a6 move.b 0(a6,d7.l),(a0)+ add.l d4,d6 addx.w d0,d6 add.l d2,d7 add.l d5,d1 bcc.b .Pix1 add.l d3,d7 .Pix1 lea 0(a1,d6.w),a6 move.b 0(a6,d7.l),(a0)+ add.l d4,d6 addx.w d0,d6 add.l d2,d7 add.l d5,d1 bcc.b .Pix0 add.l d3,d7 .Pix0 move.l (sp)+,d1 ****** loop terminations move.l .saved5(sp),d7 ;t = tnext move.l .saved4(sp),d6 ;s = snext tst.l d1 ;while (count > 0) bgt.w .loop2 move.l .savea6(sp),a6 ;while ((pspan = pspan->next) != NULL) move.l PSPAN_NEXT(a6),a6 tst.l a6 bne.w .loop add.l #.fpuregs,sp fmovem.x (sp)+,fp2-fp7 movem.l (sp)+,d2-d7/a2-a6 rts .PixTable dc.l .Pix1 dc.l .Pix2 dc.l .Pix3 dc.l .Pix4 dc.l .Pix5 dc.l .Pix6 dc.l .Pix7 dc.l .Pix8 dc.l .Pix9 dc.l .Pix10 dc.l .Pix11 dc.l .Pix12 dc.l .Pix13 dc.l .Pix14 dc.l .Pix15 dc.l .Pix16 ****************************************************************************** * * void D_DrawZSpans (espan_t *pspan) * * standard z-scan drawing function * ****************************************************************************** cnop 0,4 _D_DrawZSpans ****** Prologue. Global variables are put into registers or onto the stack ***** stackframe rsreset .fpuregs rs.x 5 .intregs rs.l 7 rs.l 1 .pspan rs.l 1 movem.l d2-d7/a2,-(sp) fmovem.x fp3-fp7,-(sp) move.l .pspan(sp),a2 move.l _d_pzbuffer,a0 move.l _d_zwidth,d7 fmove.s _d_ziorigin,fp5 fmove.s _d_zistepv,fp6 fmove.s _d_zistepu,fp7 fmove.s #32768*65536,fp0 * izistep = (int)(d_zistepu * 0x8000 * 0x10000); fmove fp7,fp1 ;fp1 = d_zistepu fmul fp0,fp1 ;multiply by $8000*$10000 fmove.l fp1,d4 ;izistep = d4 moveq #16,d6 * pdest = d_pzbuffer + (d_zwidth * pspan->v) + pspan->u; * * count = pspan->count; * * // calculate the initial 1/z * du = (float)pspan->u; * dv = (float)pspan->v; * * zi = d_ziorigin + dv*d_zistepv + du*d_zistepu; * // we count on FP exceptions being turned off to avoid range problems * izi = (int)(zi * 0x8000 * 0x10000); .loop move.l (a2)+,d0 fmove fp7,fp4 fmul.l d0,fp4 ;fp4 = du * d_zistepu move.l (a2)+,d1 fmove fp6,fp3 fmul.l d1,fp3 ;fp3 = dv * d_zistepv move.l (a2)+,d2 fadd fp3,fp4 muls d7,d1 ;d1 = pspan->v * d_zwidth fadd fp5,fp4 ;fp4 = d_ziorigin + fp3 + fp4 add.l d0,d1 ;d1 = d1 + pspan->u lea 0(a0,d1.l*2),a1 ;pdest = d_pzbuffer + d1 fmul fp0,fp4 ;izi = zi * $8000 * $10000 fmove.l fp4,d3 ;convert to integer * if ((long)pdest & 0x02) * { * *pdest++ = (short)(izi >> 16); * izi += izistep; * count--; * } move.l a1,d0 ;if ((long)pdest & 0x02) and.l #2,d0 beq.b .cont swap d3 move d3,(a1)+ ;*pdest++ = (short)(izi>>16) swap d3 add.l d4,d3 ;izi += izistep; subq #1,d2 ;count-- .cont * if ((doublecount = count >> 1) > 0) * { * do * { * ltemp = izi >> 16; * izi += izistep; * ltemp |= izi & 0xFFFF0000; * izi += izistep; * *(int *)pdest = ltemp; * pdest += 2; * } while (--doublecount > 0); * } move.l d2,d0 ;if ((doublecount=count>>1)>0) asr.l #1,d0 ble.b .cont2 subq #1,d0 .loop2 move.l d3,d5 lsr.l d6,d5 ;temp = izi >> 16 add.l d4,d3 ;izi += izistep move.l d3,d1 and.l #$ffff0000,d1 or.l d1,d5 ;ltemp |= izi & 0xFFFF0000 add.l d4,d3 ;izi += izistep move.l d5,(a1)+ ;*(int *)pdest = ltemp dbra d0,.loop2 ;while (--doublecount > 0) .cont2 * if (count & 1) * *pdest = (short)(izi >> 16); and.l #$1,d2 ;if (count & 1) beq.b .cont3 swap d3 move d3,(a1)+ ;*pdest = (short)(izi >> 16) .cont3 * } while ((pspan = pspan->pnext) != NULL); move.l (a2)+,a2 tst.l a2 bne.w .loop fmovem.x (sp)+,fp3-fp7 movem.l (sp)+,d2-d7/a2 rts ReciprocTable dc.w 0 dc.w 0 dc.w 0 dc.w 16384/3 dc.w 0 dc.w 16384/5 dc.w 16384/6 dc.w 16384/7 dc.w 0 dc.w 16384/9 dc.w 16384/10 dc.w 16384/11 dc.w 16384/12 dc.w 16384/13 dc.w 16384/14 dc.w 16384/15 _SysBase dc.l 0