Magazyn Exec 4 / CD_Magazyn_EXEC_nr_4.iso / Recent / util / boot / NewCMQ060.lha / NewCMQ060 / src / CMQ060.ASM (Assembly Source File | 2000-09-11 | 24KB | 1,154 lines)
; FILE: Source:CMQ060.ASM REV: 10 --- ultrafast CopyMemQuick060
; History
; 0 Based on CMQ060 v1.4 by Dirk Busse.
; 1 Fixed a major bug in the patch install: if memory was allocated
; near a 64k boundary, CMQ060 trashed innocent memory.
; 2 Removed two pipeline stalls from bigcopy. Speedup for all big
; copies.
; 3 Optimized the non-move16 copy loop; it now uses movem instead of
; move.l. Speedup for big unaligned copies.
; 4 Unrolled the bigcopy-loops to do 256 bytes per iteration.
; Speedup for all big copies.
; 5 Added a MorphOS check; it makes no sense to slow down MorphOS with
; m68k patches.
; 6 Now picks the move-loop for the 68040 since it's faster (thanks Chip!).
; 7 Added an odd-movem-defect test to the patch init: if the bus controller
; fails to supply correct data for an odd-movem burst line read, the
; patch is installed without movem. Special thanks to Harald Frank who
; patiently explained the problem to me.
; 8 Oops, forgot to apply the special MMU & cache setup in the odd-
; movem-defect test. Barfed with the native ppc.library, for example.
; 9 Fixed comments, made the source compile with PhxAss.
; 10 Reordered pflusha + movec dn,tc properly.
;
include "exec/types.i"
include "exec/libraries.i"
include "exec/memory.i"
include "exec/execbase.i"
include "exec/exec_lib.i"
call MACRO
jsr (_LVO\1,a6)
ENDM
USE_MOVE16 EQU 1 ; use move16 ?
SAFE_MOVE16 EQU 1 ; don't use move16 for 24bit mem ?
USE_MOVEM EQU 1 ; use movem ?
SAFE_MOVEM EQU 1 ; include odd-movem-defect test ?
SPEEDTEST EQU 0 ; enable to speedtest
IFNE SPEEDTEST
_LVOSubTime EQU -$30
_LVOReadEClock EQU -$3C
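; timer.device LVOs hardcoded here to avoid pulling in the timer includes.
;
; Speedtest harness (a rough sketch of what it apparently does): from the
; CLI, d0 holds the argument length, so with no argument PatchStart is
; timed, otherwise new_CopyMem. The chosen routine is called 16*1250 times
; (20000 calls) under Forbid, copying d2 bytes each time; the elapsed
; EClock ticks (SubTime on the two ReadEClock samples) are scaled by
; 1000000 and divided by the EClock frequency, and the total time in
; microseconds is returned in d0.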
Main move.l (4).w,a6
call Forbid
move.l d0,d7
lea (DeviceList,a6),a0
lea (.timername,pc),a1
call FindName
move.l d0,d6
move.l #500000/100/2/2,d3
lea _src+0,a2
lea _dst+8,a3
;move.l #65536+3,d2
;move.l #32768,d2
;move.l #3,d2
move.l #4096,d2
;lea Quickest,a4
lea PatchStart,a4
subq.l #1,d7
beq.b .skip
;lea new_CopyMemQuick,a4
lea new_CopyMem,a4
.skip
call CacheClearU
exg d6,a6
lea (.start,pc),a0
call ReadEClock
exg d6,a6
bra.b .loop
CNOP 0,4
.loop
REPT 16
move.l a2,a0
move.l a3,a1
move.l d2,d0
jsr (a4)
ENDR
subq.l #1,d3
bne .loop
exg d6,a6
lea (.end,pc),a0
call ReadEClock
exg d6,a6
call Permit
move.l d0,d2
exg d6,a6
lea (.end,pc),a0
lea (.start,pc),a1
call SubTime
exg d6,a6
move.l (.end+4,pc),d0
mulu.l #1000000,d1:d0
divu.l d2,d1:d0
rts
.timername dc.b 'timer.device',0
CNOP 0,4
.start ds.l 2
.end ds.l 2
SECTION TEST,BSS
_src ds.b 32768*2
_dst ds.b 32768*2
ds.b 32768 ; sanity
SECTION B,CODE
ELSE
Main move.l (4).w,a6
btst #AFB_68040,(AttnFlags+1,a6)
bne.b .ok040plus
.nomem moveq #20,d0
rts
dc.b '$VER: CMQ060'
IFEQ SAFE_MOVE16
dc.b '_Move16'
ENDC
dc.b ' 1.5 (11.9.2000)',0
.morphosname
dc.b 'MorphOS',0
CNOP 0,2
.ok040plus
lea (.morphosname,pc),a1
call FindResident
tst.l d0
beq.b .no_morphos
moveq #5,d0
rts
.no_morphos
IFNE USE_MOVEM
lea (.testcode,pc),a5
call Disable
call Supervisor
call Enable
; d0 = status
; movem routine defines
lea (COPY,pc),a5
move.w #COPY_SIZEOF,d6
move.w #new_CopyMemQuick-COPY,d7
tst.l d0 ; can we use movem ?
beq.b .use_movem
; move routine defines
lea (COPY_MOVE,pc),a5
move.w #COPY_MOVE_SIZEOF,d6
move.w #new_CopyMemQuick_MOVE-COPY_MOVE,d7
.use_movem
moveq #15,d0
moveq #MEMF_PUBLIC,d1
add.w d6,d0
moveq #8,d2
call AllocMem
and.l d0,d2
add.l d2,d0 ; d0 = address aligned to 16
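; exec's AllocMem returns memory aligned to at least 8 bytes; d2 = d0 & 8
; is either 0 or 8, so adding it rounds the address up to a 16-byte
; boundary, which is why 15 spare bytes were requested above. A failed
; allocation leaves d0 = 0 (and d2 = 0), so the beq below still catches it.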
beq .nomem
move.l a5,a0
move.l d0,a1
move.l d0,a2
move.w d6,d0 ; kind of a hack :-)
bsr _smallcopy
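; both LVOs are patched with the single .sfunc body below: the bsr runs it
; once for CopyMem, then execution falls through into .sfunc a second time
; for CopyMemQuick (a2 was advanced by d7 to the CopyMemQuick entry on the
; first pass), and that second rts returns to our caller with d0 = 0.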
move.w #_LVOCopyMem,a0 ; some nice code reuse here...
bsr.b .sfunc
move.w #_LVOCopyMemQuick,a0
.sfunc move.l a2,d0
add.w d7,a2
move.l a6,a1
call SetFunction
moveq #0,d0
rts
.testcode
ori.w #$0700,sr
moveq #0,d0
tst.b (AttnFlags+1,a6)
bmi.b .has_060
; test 060
movec vbr,a0
lea (.illegal,pc),a1
move.l ($10,a0),-(sp)
move.l ($2C,a0),-(sp)
move.l a1,($10,a0)
move.l a1,($2C,a0)
cpusha bc
cinva bc
dc.w $4E7A,$1008 ; movec buscr,d1
dc.w $4E7A,$1808 ; movec pcr,d1
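; buscr and pcr exist only on the 68060; on a 68040 these movec forms take
; an exception (the handler is installed on both the illegal-instruction
; and line-F vectors), and .illegal below reports it by setting d0 = 1.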
.exitillegal
nop
move.l (sp)+,($2C,a0)
move.l (sp)+,($10,a0)
cpusha bc
cinva bc
.has_060
IFNE SAFE_MOVEM
; figure out whether to use movem or not
; ---------------------------------
;
; Use move-loop for 68040.
;
; For 68060 this code tries to trigger the
; movem-at-odd-address-burst-line-read bug of
; some CPU cards (early Cyberstorm models?).
;
; Some Amiga 060 turbo cards don't have the special
; hardware to handle write-buffer burst writes
; -> undefined behavior.
;
; This routine disables MMU mapping, sets up
; transparent translation so that the whole system
; memory is marked "non-cacheable, imprecise",
; and then probes the whole system memory with the
; 68060 Store Buffer, stepping by 512K.
;
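; If any probed pattern read back (or written) via movem differs from
; what was stored, d0 is returned non-zero and the move-only routines
; are installed instead.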
.testbeg
tst.l d0 ; got 68040 ?
bne .texit ; yep, don't bother
movec itt0,d1
movec itt1,d2
movec dtt0,d3
movec dtt1,d4
movec cacr,d5
movec tc,d6
movem.l d1-d6/a6,-(sp)
cpusha bc
cinva bc
move.l #$00FFC000,d1 ; LAmask=$ff E=%1 S=%10 (Ignore FC2 when matching), CM=%00 (Cacheable, Writethrough), W=%0 (R/W)
move.l #$00FFC060,d2 ; LAmask=$ff E=%1 S=%10 (Ignore FC2 when matching), CM=%11 (Cache-Inhibited, Imprecise Exception Model), W=%0 (R/W)
move.l #(1<<31)|(1<<15),d5 ; EDC = 1 (Enable Data Cache), ESB = 0 (Disable Store Buffer), EIC = 1 (Enable Instruction Cache)
and.w #~$8000,d6 ; Mask out E-bit
movec d1,itt0
movec d1,itt1
movec d2,dtt0
movec d2,dtt1
movec d5,cacr
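; pflusha before the tc write below; this ordering is what history note 10 fixed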
pflusha
movec d6,tc
cpusha bc
lea (MemList,a6),a4
move.l #$5555aaaa,d6
move.l #$aaaa5555,d7
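; d6/d7: complementary bit patterns used as probe data by .writeset/.anal below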
.scan move.l (a4),a4
tst.l (a4)
beq .scandone
; only PUBLIC memory areas
btst #MEMB_PUBLIC,(MH_ATTRIBUTES+1,a4)
beq.b .scan
; figure out scan area
moveq #4+15,d0
moveq #-(12*4),d1
add.l (MH_LOWER,a4),d0
add.l (MH_UPPER,a4),d1
and.w #-16,d0 ; align to cacheline
and.w #-16,d1
; sanity check
cmp.l d1,d0
bhs.b .scan
subq.l #4,d0 ; get back one longword
subq.l #4,d1
move.l d0,a6
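; a6 now sits 4 bytes below a 16-byte line, so the +1/+3 offsets probed
; below start the movem just before a line boundary (presumably the case
; that triggers the defect); see the sketch that follows.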
;line1 -+-------- line2 --------+-------- line3 --------+- line4 --
; | | |
;2,$3333,$0000,$1111,$2222,$3333,$0000,$1111,$2222,$3333,$0000,$1111
movem.l d1/a4,-(sp)
.testloop
; is the address inside this code?
lea (.testend,pc),a0 ; testend < addr ?
cmp.l a0,a6
bhi.b .not_inside
lea (.testbeg,pc),a0 ; testbeg < addr ?
cmp.l a0,a6
bls.b .inside
.not_inside
; is the stack somewhere close?
lea (8192,a7),a0 ; sp+8192 < addr ?
cmp.l a0,a6
bhi.b .not_inside2
lea (-8192,a7),a0 ; sp-8192 < addr ?
cmp.l a0,a6
bls.b .inside
.not_inside2
; backup the memory
movem.l (a6),d0-d5/a0-a5
movem.l d0-d5/a0-a5,-(sp)
cpusha dc
; turn on store buffer
move.l #(1<<31)|(1<<29)|(1<<15),d0 ; EDC = 1 (Enable Data Cache), ESB = 1 (Enable Store Buffer), EIC = 1 (Enable Instruction Cache)
movec d0,cacr
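; probe at byte offsets +1 and +3 from the aligned base, then put a6 back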
addq.l #1,a6
bsr.b .anaali
addq.l #2,a6
bsr.b .anaali
subq.l #3,a6
; turn off store buffer
move.l #(1<<31)|(1<<15),d0 ; EDC = 1 (Enable Data Cache), ESB = 0 (Disable Store Buffer), EIC = 1 (Enable Instruction Cache)
movec d0,cacr
; restore orig memory
movem.l (sp)+,d0-d5/a0-a5
movem.l d0-d5/a0-a5,(a6)
cpusha dc
.inside
; advance 512K
add.l #$80000,a6
cmp.l (sp),a6 ; (sp) = upper limit
blo .testloop
movem.l (sp)+,d1/a4
bra .scan
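; .anaali: write the d6/d7 pattern at the misaligned a6 with plain moves
; and verify it with a movem read, then rewrite it with a movem write and
; verify it again; any mismatch bails out through .use_move.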
.anaali
bsr.b .flush
; write a test bitset
; make sure we actually do read the memory
; instead of the cache
bsr.b .writeset
; read the test bitset back & analyze result
bsr.b .anal
; trash the set...
exg d6,d7
bsr.b .writeset
exg d6,d7
; write the bitset with movem
move.l d6,a0
move.l d7,a1
move.l d6,a2
movem.l d0-d5/a0-a4,(a6)
; make sure we actually do read the memory
; instead of the cache
bsr.b .flush
; read the test bitset back & analyze result
bsr.b .anal ; must not bra!
rts
.writeset move.l d6,(a6) ; d0
move.l d7,(1*4,a6) ; d1
move.l d6,(2*4,a6) ; d2
move.l d7,(3*4,a6) ; d3
move.l d6,(4*4,a6) ; d4
move.l d7,(5*4,a6) ; d5
move.l d6,(6*4,a6) ; a0
move.l d7,(7*4,a6) ; a1
move.l d6,(8*4,a6) ; a2
move.l d7,(9*4,a6) ; a3
move.l d6,(10*4,a6) ; a4
.flush lea (4,a6),a2
lea (4+16,a6),a1
lea (4+32,a6),a0
cpushl dc,(a6)
cpushl dc,(a2)
cpushl dc,(a1)
cpushl dc,(a0)
rts
.anal movem.l (a6),d0-d5/a0-a4
cmp.l d6,d0
bne.b .use_move
cmp.l d7,d1
bne.b .use_move
cmp.l d6,d2
bne.b .use_move
cmp.l d7,d3
bne.b .use_move
cmp.l d6,d4
bne.b .use_move
cmp.l d7,d5
bne.b .use_move
cmp.l d6,a0
bne.b .use_move
cmp.l d7,a1
bne.b .use_move
cmp.l d6,a2
bne.b .use_move
cmp.l d7,a3
bne.b .use_move
cmp.l d6,a4
beq.b .flush
.use_move
addq.l #2*4,sp ; pop return addresses
; align a6 to orig
move.l a6,d0
and.w #-4,d0
move.l d0,a6
; turn off store buffer
move.l #(1<<31)|(1<<15),d0 ; EDC = 1 (Enable Data Cache), ESB = 0 (Disable Store Buffer), EIC = 1 (Enable Instruction Cache)
movec d0,cacr
; restore orig memory
movem.l (sp)+,d0-d5/a0-a5
movem.l d0-d5/a0-a5,(a6)
cpusha dc
addq.l #2*4,sp ; pop stack
; indicate move-mode
moveq #1,d0
dc.w $51FA ; trapf.w: its extension word swallows the following moveq #0,d0, so d0 stays 1
.scandone
moveq #0,d0
movem.l (sp)+,d1-d6/a6
cpusha bc
movec d1,itt0
movec d2,itt1
movec d3,dtt0
movec d4,dtt1
movec d5,cacr
pflusha
movec d6,tc
cpusha bc
cinva bc
.texit
ENDC
nop
rte
.illegal
; oops, this is a 68040
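; rewrite the stacked PC (at offset 2 in the exception frame) so the rte
; resumes at .exitillegal, and leave d0 = 1 to report "not a 68060"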
lea (.exitillegal,pc),a1
moveq #1,d0
move.l a1,(2,sp)
nop
rte
.testend
ELSE
; NOTE: d0.l = 0
IFD _PHXASS_
move.w #COPY_SIZEOF+8+7,d0
ELSE
move.w #(COPY_SIZEOF+8+7)&-8,d0
ENDC
moveq #MEMF_PUBLIC,d1
moveq #8,d2
call AllocMem
and.l d0,d2
add.l d2,d0 ; d0 = address aligned to 16
beq.b .nomem
lea (COPY,pc),a0
move.l d0,a1
move.l d0,a2
move.w #COPY_SIZEOF,d0 ; kind of a hack :-)
bsr.b _smallcopy
move.w #_LVOCopyMem,a0 ; some nice code reuse here...
bsr.b .sfunc
move.w #_LVOCopyMemQuick,a0
.sfunc move.l a2,d0
lea (new_CopyMemQuick-COPY,a2),a2
move.l a6,a1
call SetFunction
moveq #0,d0
rts
ENDC
ENDC
IFD _PHXASS_
CNOP 0,16
ENDC
COPY
new_CopyMem
tst.l d0 ; zero len copy?
beq.b _exit ; yep, quit
move.w a1,d1
btst #0,d1 ; destination aligned by 2?
beq.b .dst_align2 ; yep, try 4
move.b (a0)+,(a1)+ ; do the byte
subq.l #1,d0 ; sub count
beq.b _exit ; was the only byte, quit!
move.w a1,d1
.dst_align2
btst #1,d1 ; destination aligned by 4?
beq.b .dst_align4 ; yep, try big copy
cmpi.l #1,d0
bne.b .not_byte
.copy_byte
move.b (a0)+,(a1)+ ; copy last byte
rts
.not_byte
move.w (a0)+,(a1)+ ; do the word
subq.l #2,d0 ; sub count
beq.b _exit ; was the only word, quit!
.dst_align4
; now destination is longword aligned
cmpi.l #(2048+16),d0 ; worth the trouble?
bcc.b _bigcopy ; yep, do fast copy
_smallcopy
move.w d0,d1
lsr.w #2,d1
beq.b .nolongs
.copy move.l (a0)+,(a1)+
subq.w #1,d1
bne.b .copy
.nolongs
btst #1,d0 ; long copy done, one word left?
beq.b .no_last_word ; nope
move.w (a0)+,(a1)+ ; copy last word
.no_last_word
btst #0,d0 ; one byte left?
beq.b .no_last_byte ; nope
move.b (a0)+,(a1)+ ; copy last byte
.no_last_byte
_exit rts
_bigcopy
IFNE USE_MOVE16
IFNE SAFE_MOVE16
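; with SAFE_MOVE16 the move16 path is skipped whenever either address
; lies in 24-bit address space (below $01000000), e.g. chip RAM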
cmp.l #$01000000,a0
bcs.b .bigcopy_nomove16
move.w a1,d1
cmp.l #$01000000,a1
bcs.b .bigcopy_nomove16
ELSE
move.w a1,d1
ENDC
btst #2,d1 ; destination aligned by 8?
beq.b .dst_align8 ; yep, try 16
move.l (a0)+,(a1)+ ; nope, make it!
addq.w #4,d1
subq.l #4,d0
.dst_align8
btst #3,d1 ; destination aligned by 16?
beq.b .dst_align16 ; yep, try source
move.l (a0)+,(a1)+ ; nope, make it!
move.l (a0)+,(a1)+
subq.l #8,d0
.dst_align16
move.w a0,d1
and.w #15,d1 ; source aligned by 16?
bne.b .bigcopy_nomove16 ; nope, so let's use normal copy
; source: aligned by 16
; destination: aligned by 16
move.l d0,d1
lsr.l #8,d1 ; copy 256 bytes per go
.copy256
REPT 16
move16 (a0)+,(a1)+
ENDR
subq.l #1,d1
bne.b .copy256
and.w #255,d0 ; handle rest if needed
bne _smallcopy
rts
.bigcopy_nomove16
ENDC
IFNE USE_MOVEM
; unrolled movem.l loop, using 12 registers
movem.l d0/d2-d7/a2-a6,-(sp)
lsr.l #8,d0 ; copy 256 bytes per go
.mcopy256
movem.l (a0)+,d1-d7/a2-a6 ; 12*4=48
movem.l d1-d7/a2-a6,(a1)
movem.l (a0)+,d1-d7/a2-a6 ; 12*4=48
movem.l d1-d7/a2-a6,(1*48,a1)
movem.l (a0)+,d1-d7/a2-a6 ; 12*4=48
movem.l d1-d7/a2-a6,(2*48,a1)
movem.l (a0)+,d1-d7/a2-a6 ; 12*4=48
movem.l d1-d7/a2-a6,(3*48,a1)
movem.l (a0)+,d1-d7/a2-a6 ; 12*4=48
movem.l d1-d7/a2-a6,(4*48,a1)
movem.l (a0)+,d1-d4 ; 4*4=16
movem.l d1-d4,(5*48,a1)
subq.l #1,d0
lea (256,a1),a1
bne.b .mcopy256
movem.l (sp)+,d0/d2-d7/a2-a6
and.w #255,d0 ; handle rest if needed
ELSE
; unrolled move.l loop
move.l d0,d1
lsr.l #6,d1 ; copy 64 bytes per go
.mcopy64
REPT 16
move.l (a0)+,(a1)+
ENDR
subq.l #1,d1
bne.b .mcopy64
and.w #63,d0 ; handle rest if needed
ENDC
bne _smallcopy
rts
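; align new_CopyMemQuick to a 16-byte offset within the copied block:
; PhxAss gets a CNOP, other assemblers pad with $51FC (trapf) filler
; words; the block itself is installed at a 16-byte-aligned address.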
IFD _PHXASS_
CNOP 0,16
ELSE
IFGT 16-((*-COPY)&15)
dcb.w (16-((*-COPY)&15))>>1,$51FC
ENDC
ENDC
new_CopyMemQuick
lsr.l #2,d0 ; bytes -> longwords
cmpi.l #(2048+16)>>2,d0 ; worth the trouble?
bcc.b .bigcopy ; yep, do fast copy
tst.w d0
beq.b .exit ; nothing to copy, quit!
.smallcopy
.copy move.l (a0)+,(a1)+
subq.w #1,d0
bne.b .copy
.exit rts
.bigcopy
IFNE USE_MOVE16
IFNE SAFE_MOVE16
cmp.l #$01000000,a0
bcs.b .bigcopy_nomove16
move.w a1,d1
cmp.l #$01000000,a1
bcs.b .bigcopy_nomove16
ELSE
move.w a1,d1
ENDC
btst #2,d1 ; destination aligned by 8?
beq.b .dst_align8 ; yep, try 16
move.l (a0)+,(a1)+ ; nope, make it!
addq.w #4,d1
subq.l #4>>2,d0
.dst_align8
btst #3,d1 ; destination aligned by 16?
beq.b .dst_align16 ; yep, try source
move.l (a0)+,(a1)+ ; nope, make it!
move.l (a0)+,(a1)+
subq.l #8>>2,d0
.dst_align16
move.w a0,d1
and.w #15,d1 ; source aligned by 16?
bne.b .bigcopy_nomove16 ; nope, so let's use normal copy
; source: aligned by 16
; destination: aligned by 16
move.l d0,d1
lsr.l #8-2,d1 ; copy 256 bytes per go
.copy256
REPT 16
move16 (a0)+,(a1)+
ENDR
subq.l #1,d1
bne.b .copy256
and.w #255>>2,d0 ; handle rest if needed
bne .smallcopy
rts
.bigcopy_nomove16
ENDC
IFNE USE_MOVEM
; unrolled movem.l loop, using 12 registers
movem.l d0/d2-d7/a2-a6,-(sp)
lsr.l #8-2,d0 ; copy 256 bytes per go
.mcopy256
movem.l (a0)+,d1-d7/a2-a6 ; 12*4=48
movem.l d1-d7/a2-a6,(a1)
movem.l (a0)+,d1-d7/a2-a6 ; 12*4=48
movem.l d1-d7/a2-a6,(1*48,a1)
movem.l (a0)+,d1-d7/a2-a6 ; 12*4=48
movem.l d1-d7/a2-a6,(2*48,a1)
movem.l (a0)+,d1-d7/a2-a6 ; 12*4=48
movem.l d1-d7/a2-a6,(3*48,a1)
movem.l (a0)+,d1-d7/a2-a6 ; 12*4=48
movem.l d1-d7/a2-a6,(4*48,a1)
movem.l (a0)+,d1-d4 ; 4*4=16
movem.l d1-d4,(5*48,a1)
subq.l #1,d0
lea (256,a1),a1
bne.b .mcopy256
movem.l (sp)+,d0/d2-d7/a2-a6
and.w #255>>2,d0 ; handle rest if needed
ELSE
; unrolled move.l loop
move.l d0,d1
lsr.l #6-2,d1 ; copy 64 bytes per go
.mcopy64
REPT 16
move.l (a0)+,(a1)+
ENDR
subq.l #1,d1
bne.b .mcopy64
and.w #63>>2,d0 ; handle rest if needed
ENDC
bne .smallcopy
rts
COPY_SIZEOF EQU (*-COPY)
IFNE USE_MOVEM&SAFE_MOVEM
IFD _PHXASS_
CNOP 0,16
ENDC
COPY_MOVE
new_CopyMem_MOVE
tst.l d0 ; zero len copy?
beq.b .exit ; yep, quit
move.w a1,d1
btst #0,d1 ; destination aligned by 2?
beq.b .dst_align2 ; yep, try 4
move.b (a0)+,(a1)+ ; do the byte
subq.l #1,d0 ; sub count
beq.b .exit ; was the only byte, quit!
move.w a1,d1
.dst_align2
btst #1,d1 ; destination aligned by 4?
beq.b .dst_align4 ; yep, try big copy
cmpi.l #1,d0
bne.b .not_byte
.copy_byte
move.b (a0)+,(a1)+ ; copy last byte
rts
.not_byte
move.w (a0)+,(a1)+ ; do the word
subq.l #2,d0 ; sub count
beq.b .exit ; was the only word, quit!
.dst_align4
; now destination is longword aligned
cmpi.l #(2048+16),d0 ; worth the trouble?
bcc.b .bigcopy ; yep, do fast copy
.smallcopy
move.w d0,d1
lsr.w #2,d1
beq.b .nolongs
.copy move.l (a0)+,(a1)+
subq.w #1,d1
bne.b .copy
.nolongs
btst #1,d0 ; long copy done, one word left?
beq.b .no_last_word ; nope
move.w (a0)+,(a1)+ ; copy last word
.no_last_word
btst #0,d0 ; one byte left?
beq.b .no_last_byte ; nope
move.b (a0)+,(a1)+ ; copy last byte
.no_last_byte
.exit rts
.bigcopy
IFNE USE_MOVE16
IFNE SAFE_MOVE16
cmp.l #$01000000,a0
bcs.b .bigcopy_nomove16
move.w a1,d1
cmp.l #$01000000,a1
bcs.b .bigcopy_nomove16
ELSE
move.w a1,d1
ENDC
btst #2,d1 ; destination aligned by 8?
beq.b .dst_align8 ; yep, try 16
move.l (a0)+,(a1)+ ; nope, make it!
addq.w #4,d1
subq.l #4,d0
.dst_align8
btst #3,d1 ; destination aligned by 16?
beq.b .dst_align16 ; yep, try source
move.l (a0)+,(a1)+ ; nope, make it!
move.l (a0)+,(a1)+
subq.l #8,d0
.dst_align16
move.w a0,d1
and.w #15,d1 ; source aligned by 16?
bne.b .bigcopy_nomove16 ; nope, so let's use normal copy
; source: aligned by 16
; destination: aligned by 16
move.l d0,d1
lsr.l #8,d1 ; copy 256 bytes per go
.copy256
REPT 16
move16 (a0)+,(a1)+
ENDR
subq.l #1,d1
bne.b .copy256
and.w #255,d0 ; handle rest if needed
bne .smallcopy
rts
.bigcopy_nomove16
ENDC
; unrolled move.l loop
move.l d0,d1
lsr.l #6,d1 ; copy 64 bytes per go
.mcopy64
REPT 16
move.l (a0)+,(a1)+
ENDR
subq.l #1,d1
bne.b .mcopy64
and.w #63,d0 ; handle rest if needed
bne .smallcopy
rts
IFD _PHXASS_
CNOP 0,16
ELSE
IFGT 16-((*-COPY_MOVE)&15)
dcb.w (16-((*-COPY_MOVE)&15))>>1,$51FC
ENDC
ENDC
new_CopyMemQuick_MOVE
lsr.l #2,d0 ; bytes -> longwords
cmpi.l #(2048+16)>>2,d0 ; worth the trouble?
bcc.b .bigcopy ; yep, do fast copy
tst.w d0
beq.b .exit ; nothing to copy, quit!
.smallcopy
.copy move.l (a0)+,(a1)+
subq.w #1,d0
bne.b .copy
.exit rts
.bigcopy
IFNE USE_MOVE16
IFNE SAFE_MOVE16
cmp.l #$01000000,a0
bcs.b .bigcopy_nomove16
move.w a1,d1
cmp.l #$01000000,a1
bcs.b .bigcopy_nomove16
ELSE
move.w a1,d1
ENDC
btst #2,d1 ; destination aligned by 8?
beq.b .dst_align8 ; yep, try 16
move.l (a0)+,(a1)+ ; nope, make it!
addq.w #4,d1
subq.l #4>>2,d0
.dst_align8
btst #3,d1 ; destination aligned by 16?
beq.b .dst_align16 ; yep, try source
move.l (a0)+,(a1)+ ; nope, make it!
move.l (a0)+,(a1)+
subq.l #8>>2,d0
.dst_align16
move.w a0,d1
and.w #15,d1 ; source aligned by 16?
bne.b .bigcopy_nomove16 ; nope, so let's use normal copy
; source: aligned by 16
; destination: aligned by 16
move.l d0,d1
lsr.l #8-2,d1 ; copy 256 bytes per go
.copy256
REPT 16
move16 (a0)+,(a1)+
ENDR
subq.l #1,d1
bne.b .copy256
and.w #255>>2,d0 ; handle rest if needed
bne .smallcopy
rts
.bigcopy_nomove16
ENDC
; unrolled move.l loop
move.l d0,d1
lsr.l #6-2,d1 ; copy 64 bytes per go
.mcopy64
REPT 16
move.l (a0)+,(a1)+
ENDR
subq.l #1,d1
bne.b .mcopy64
and.w #63>>2,d0 ; handle rest if needed
bne .smallcopy
rts
COPY_MOVE_SIZEOF EQU (*-COPY_MOVE)
ENDC
IFNE SPEEDTEST
SECTION C,CODE
*******************************************
* CopyMemQuicker060 *
* Uses Move16 only at addresses >$00ffffff *
* *
* Include File V1.4 *
* written by Dirk Busse *
* 3. Apr. 1999 *
*******************************************
PatchStart
tst.l d0
beq.b squit
move.w a1,d1 ;destination address to d1 (only the lowest two bits are needed)
btst #0,d1
beq.b .skip1
move.b (a0)+,(a1)+ ;make the destination address even
subq.l #1,d0
beq.b squit
move.w a1,d1
.skip1 btst #1,d1
beq.b .skip2
cmpi.l #1,d0
bne.b .two
move.b (a0)+,(a1)+
rts
; cnop 0,4
.two move.w (a0)+,(a1)+ ;make the destination address longword-aligned
subq.l #2,d0
beq.b squit
.skip2
cmpi.l #%0000100000010000,d0 ;check whether d0 >= 2064
bcc.b bigmove
smlmove move.l d0,d1
lsr.w #2,d1 ;d0 is at most 12 bits here
beq.b .nolong
.loop move.l (a0)+,(a1)+
subq.w #1,d1
bne.b .loop
.nolong btst #1,d0
beq.b .skip
move.w (a0)+,(a1)+ ;copy the last word
.skip btst #0,d0
beq.b squit
move.b (a0)+,(a1)+ ;copy the last byte
squit rts
; cnop 0,4
bigmove cmp.l #$01000000,a0 ;is the source address >$00ffffff ?
bcs.b bigmov
cmp.l #$01000000,a1 ;is the destination address >$00ffffff ?
bcs.b bigmov
move.w a1,d1
btst #2,d1
beq.b .skip1
subq.l #%100,d0 ;align the destination to an 8-byte block
move.l (a0)+,(a1)+
move.w a1,d1
.skip1 btst #3,d1
beq.b .dest16
subq.l #%1000,d0 ;align the destination to a 16-byte block
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
.dest16 move.w a0,d1
andi.b #15,d1
bne.b bigmov ;if the source address is not 16-byte aligned
move.l d0,d1
lsr.l #7,d1
.loop move16 (a0)+,(a1)+
move16 (a0)+,(a1)+
move16 (a0)+,(a1)+
move16 (a0)+,(a1)+
move16 (a0)+,(a1)+
move16 (a0)+,(a1)+
move16 (a0)+,(a1)+
move16 (a0)+,(a1)+
subq.l #1,d1
bne.b .loop
and.w #%0000000001111111,d0 ;the upper 16 bits are ignored from here on
bne.b smlmove
rts
; cnop 0,4
bigmov move.l d0,d1
lsr.l #6,d1
.loop move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
subq.l #1,d1
bne.b .loop
and.w #%0000000000111111,d0 ;the upper 16 bits are ignored from here on
bne smlmove
rts
cnop 0,4
**************************************************************************
Quickest
lsr.l #2,d0
cmpi.l #%0000001000000100,d0 ;check whether d0 >= 2064/4
bcc.b bigmo16
smlmovQ tst.w d0 ;d0 is at most 10 bits here
beq.b squitQ
sloopQ move.l (a0)+,(a1)+
subq.w #1,d0
bne.b sloopQ
squitQ rts
; cnop 0,4
bigmo16 cmp.l #$01000000,a0 ;is the source address >$00ffffff ?
bcs.b bigmovQ
cmp.l #$01000000,a1 ;is the destination address >$00ffffff ?
bcs.b bigmovQ
move.w a1,d1
btst #2,d1
beq.b .skip1
subq.l #1,d0 ;align the destination to an 8-byte block
move.l (a0)+,(a1)+
move.w a1,d1
.skip1 btst #3,d1
beq.b .dest16
subq.l #2,d0 ;align the destination to a 16-byte block
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
.dest16 move.w a0,d1
andi.b #15,d1
bne.b bigmovQ ;if the source address is not 16-byte aligned
move.l d0,d1
lsr.l #5,d1
.loop move16 (a0)+,(a1)+
move16 (a0)+,(a1)+
move16 (a0)+,(a1)+
move16 (a0)+,(a1)+
move16 (a0)+,(a1)+
move16 (a0)+,(a1)+
move16 (a0)+,(a1)+
move16 (a0)+,(a1)+
subq.l #1,d1
bne.b .loop
and.w #%0000000000011111,d0 ;the upper 16 bits are ignored from here on
bne.b sloopQ
rts
; cnop 0,4
bigmovQ move.l d0,d1
lsr.l #4,d1
.loop move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
subq.l #1,d1
bne.b .loop
and.w #%0000000000001111,d0 ;the upper 16 bits are ignored from here on
bne sloopQ
rts
cnop 0,4
PatchEnd
ENDC