Magazyn Exec 4 / CD_Magazyn_EXEC_nr_4.iso / Recent / util / boot / NewCMQ060.lha / NewCMQ060 / src / CMQ060.ASM (Assembly Source File | 2000-09-11 | 24KB | 1,154 lines)
; FILE: Source:CMQ060.ASM REV: 10 --- ultrafast CopyMemQuick060
; History
; 0 Based on CMQ060 v1.4 by Dirk Busse.
; 1 Fixed a major bug in the patch install: if memory was allocated
; near a 64k boundary, CMQ060 trashed innocent memory.
; 2 Removed two pipeline stalls from bigcopy. Speedup for all big
; copies.
; 3 Optimized the non-move16 copy loop; it now uses movem instead of
; move.l. Speedup for big unaligned copies.
; 4 Unrolled the bigcopy-loops to do 256 bytes per iteration.
; Speedup for all big copies.
; 5 Added a MorphOS check; it makes no sense to slow down MorphOS with
; m68k patches.
; 6 Now picks the move-loop for the 68040 since it's faster (thanks Chip!).
; 7 Added an odd-movem-defect test to the patch init: if the bus controller
; fails to supply correct data for an odd-movem burst line read, the
; patch is installed without movem. Special thanks to Harald Frank who
; patiently explained the problem to me.
; 8 Oops, forgot to apply the special MMU & cache setup in the odd-
; movem-defect test. Barfed with the native ppc.library, for example.
; 9 Fixed comments, made the source compile with PhxAss.
; 10 Reordered pflusha + movec dn,tc properly.
;
include "exec/types.i"
include "exec/libraries.i"
include "exec/memory.i"
include "exec/execbase.i"
include "exec/exec_lib.i"
call MACRO
jsr (_LVO\1,a6)
ENDM
USE_MOVE16 EQU 1 ; use move16 ?
SAFE_MOVE16 EQU 1 ; don't use move16 for 24bit mem ?
USE_MOVEM EQU 1 ; use movem ?
SAFE_MOVEM EQU 1 ; include odd-movem-defect test ?
SPEEDTEST EQU 0 ; enable to speedtest
IFNE SPEEDTEST
_LVOSubTime EQU -$30
_LVOReadEClock EQU -$3C
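; timer.device LVOs hardcoded here to avoid pulling in the timer includes.
;
; Speedtest harness (a rough sketch of what it apparently does): from the
; CLI, d0 holds the argument length, so with no argument PatchStart is
; timed, otherwise new_CopyMem. The chosen routine is called 16*1250 times
; (20000 calls) under Forbid, copying d2 bytes each time; the elapsed
; EClock ticks (SubTime on the two ReadEClock samples) are scaled by
; 1000000 and divided by the EClock frequency, and the total time in
; microseconds is returned in d0.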
Main move.l (4).w,a6
call Forbid
move.l d0,d7
lea (DeviceList,a6),a0
lea (.timername,pc),a1
call FindName
move.l d0,d6
move.l #500000/100/2/2,d3
lea _src+0,a2
lea _dst+8,a3
;move.l #65536+3,d2
;move.l #32768,d2
;move.l #3,d2
move.l #4096,d2
;lea Quickest,a4
lea PatchStart,a4
subq.l #1,d7
beq.b .skip
;lea new_CopyMemQuick,a4
lea new_CopyMem,a4
.skip
call CacheClearU
exg d6,a6
lea (.start,pc),a0
call ReadEClock
exg d6,a6
bra.b .loop
CNOP 0,4
.loop
REPT 16
move.l a2,a0
move.l a3,a1
move.l d2,d0
jsr (a4)
ENDR
subq.l #1,d3
bne .loop
exg d6,a6
lea (.end,pc),a0
call ReadEClock
exg d6,a6
call Permit
move.l d0,d2
exg d6,a6
lea (.end,pc),a0
lea (.start,pc),a1
call SubTime
exg d6,a6
move.l (.end+4,pc),d0
mulu.l #1000000,d1:d0
divu.l d2,d1:d0
rts
.timername dc.b 'timer.device',0
CNOP 0,4
.start ds.l 2
.end ds.l 2
SECTION TEST,BSS
_src ds.b 32768*2
_dst ds.b 32768*2
ds.b 32768 ; sanity
SECTION B,CODE
ELSE
Main move.l (4).w,a6
btst #AFB_68040,(AttnFlags+1,a6)
bne.b .ok040plus
.nomem moveq #20,d0
rts
dc.b '$VER: CMQ060'
IFEQ SAFE_MOVE16
dc.b '_Move16'
ENDC
dc.b ' 1.5 (11.9.2000)',0
.morphosname
dc.b 'MorphOS',0
CNOP 0,2
.ok040plus
lea (.morphosname,pc),a1
call FindResident
tst.l d0
beq.b .no_morphos
moveq #5,d0
rts
.no_morphos
IFNE USE_MOVEM
lea (.testcode,pc),a5
call Disable
call Supervisor
call Enable
; d0 = status
; movem routine defines
lea (COPY,pc),a5
move.w #COPY_SIZEOF,d6
move.w #new_CopyMemQuick-COPY,d7
tst.l d0 ; can we use movem ?
beq.b .use_movem
; move routine defines
lea (COPY_MOVE,pc),a5
move.w #COPY_MOVE_SIZEOF,d6
move.w #new_CopyMemQuick_MOVE-COPY_MOVE,d7
.use_movem
moveq #15,d0
moveq #MEMF_PUBLIC,d1
add.w d6,d0
moveq #8,d2
call AllocMem
and.l d0,d2
add.l d2,d0 ; d0 = address aligned to 16
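; exec's AllocMem returns memory aligned to at least 8 bytes; d2 = d0 & 8
; is either 0 or 8, so adding it rounds the address up to a 16-byte
; boundary, which is why 15 spare bytes were requested above. A failed
; allocation leaves d0 = 0 (and d2 = 0), so the beq below still catches it.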
beq .nomem
move.l a5,a0
move.l d0,a1
move.l d0,a2
move.w d6,d0 ; kind of a hack :-)
bsr _smallcopy
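; both LVOs are patched with the single .sfunc body below: the bsr runs it
; once for CopyMem, then execution falls through into .sfunc a second time
; for CopyMemQuick (a2 was advanced by d7 to the CopyMemQuick entry on the
; first pass), and that second rts returns to our caller with d0 = 0.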
move.w #_LVOCopyMem,a0 ; some nice code reuse here...
bsr.b .sfunc
move.w #_LVOCopyMemQuick,a0
.sfunc move.l a2,d0
add.w d7,a2
move.l a6,a1
call SetFunction
moveq #0,d0
rts
.testcode
ori.w #$0700,sr
moveq #0,d0
tst.b (AttnFlags+1,a6)
bmi.b .has_060
; test 060
movec vbr,a0
lea (.illegal,pc),a1
move.l ($10,a0),-(sp)
move.l ($2C,a0),-(sp)
move.l a1,($10,a0)
move.l a1,($2C,a0)
cpusha bc
cinva bc
dc.w $4E7A,$1008 ; movec buscr,d1
dc.w $4E7A,$1808 ; movec pcr,d1
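; buscr and pcr exist only on the 68060; on a 68040 these movec forms take
; an exception (the handler is installed on both the illegal-instruction
; and line-F vectors), and .illegal below reports it by setting d0 = 1.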
.exitillegal
nop
move.l (sp)+,($2C,a0)
move.l (sp)+,($10,a0)
cpusha bc
cinva bc
.has_060
IFNE SAFE_MOVEM
; figure out whether to use movem or not
; ---------------------------------
;
; Use move-loop for 68040.
;
; For 68060 this code tries to trigger the
; movem-at-odd-address-burst-line-read bug of
; some CPU cards (early Cyberstorm models?).
;
; Some Amiga 060 turbo cards don't have the special
; hardware to handle write-buffer burst writes
; -> undefined behavior.
;
; This routine disables MMU mapping, sets up
; transparent translation so that the whole system
; memory is marked "non-cacheable, imprecise",
; and then probes the whole system memory with the
; 68060 Store Buffer, stepping by 512K.
;
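; If any probed pattern read back (or written) via movem differs from
; what was stored, d0 is returned non-zero and the move-only routines
; are installed instead.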
.testbeg
tst.l d0 ; got 68040 ?
bne .texit ; yep, don't bother
movec itt0,d1
movec itt1,d2
movec dtt0,d3
movec dtt1,d4
movec cacr,d5
movec tc,d6
movem.l d1-d6/a6,-(sp)
cpusha bc
cinva bc
move.l #$00FFC000,d1 ; LAmask=$ff E=%1 S=%10 (Ignore FC2 when matching), CM=%00 (Cacheable, Writethrough), W=%0 (R/W)
move.l #$00FFC060,d2 ; LAmask=$ff E=%1 S=%10 (Ignore FC2 when matching), CM=%11 (Cache-Inhibited, Imprecise Exception Model), W=%0 (R/W)
move.l #(1<<31)|(1<<15),d5 ; EDC = 1 (Enable Data Cache), ESB = 0 (Disable Store Buffer), EIC = 1 (Enable Instruction Cache)
and.w #~$8000,d6 ; Mask out E-bit
movec d1,itt0
movec d1,itt1
movec d2,dtt0
movec d2,dtt1
movec d5,cacr
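; pflusha before the tc write below; this ordering is what history note 10 fixed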
pflusha
movec d6,tc
cpusha bc
lea (MemList,a6),a4
move.l #$5555aaaa,d6
move.l #$aaaa5555,d7
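; d6/d7: complementary bit patterns used as probe data by .writeset/.anal below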
.scan move.l (a4),a4
tst.l (a4)
beq .scandone
; only PUBLIC memory areas
btst #MEMB_PUBLIC,(MH_ATTRIBUTES+1,a4)
beq.b .scan
; figure out scan area
moveq #4+15,d0
moveq #-(12*4),d1
add.l (MH_LOWER,a4),d0
add.l (MH_UPPER,a4),d1
and.w #-16,d0 ; align to cacheline
and.w #-16,d1
; sanity check
cmp.l d1,d0
bhs.b .scan
subq.l #4,d0 ; get back one longword
subq.l #4,d1
move.l d0,a6
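; a6 now sits 4 bytes below a 16-byte line, so the +1/+3 offsets probed
; below start the movem just before a line boundary (presumably the case
; that triggers the defect); see the sketch that follows.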
;line1 -+-------- line2 --------+-------- line3 --------+- line4 --
; | | |
;2,$3333,$0000,$1111,$2222,$3333,$0000,$1111,$2222,$3333,$0000,$1111
movem.l d1/a4,-(sp)
.testloop
; is the address inside this code?
lea (.testend,pc),a0 ; testend < addr ?
cmp.l a0,a6
bhi.b .not_inside
lea (.testbeg,pc),a0 ; testbeg < addr ?
cmp.l a0,a6
bls.b .inside
.not_inside
; is the stack somewhere close?
lea (8192,a7),a0 ; sp+8192 < addr ?
cmp.l a0,a6
bhi.b .not_inside2
lea (-8192,a7),a0 ; sp-8192 < addr ?
cmp.l a0,a6
bls.b .inside
.not_inside2
; backup the memory
movem.l (a6),d0-d5/a0-a5
movem.l d0-d5/a0-a5,-(sp)
cpusha dc
; turn on store buffer
move.l #(1<<31)|(1<<29)|(1<<15),d0 ; EDC = 1 (Enable Data Cache), ESB = 1 (Enable Store Buffer), EIC = 1 (Enable Instruction Cache)
movec d0,cacr
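; probe at byte offsets +1 and +3 from the aligned base, then put a6 back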
addq.l #1,a6
bsr.b .anaali
addq.l #2,a6
bsr.b .anaali
subq.l #3,a6
; turn off store buffer
move.l #(1<<31)|(1<<15),d0 ; EDC = 1 (Enable Data Cache), ESB = 0 (Disable Store Buffer), EIC = 1 (Enable Instruction Cache)
movec d0,cacr
; restore orig memory
movem.l (sp)+,d0-d5/a0-a5
movem.l d0-d5/a0-a5,(a6)
cpusha dc
.inside
; advance 512K
add.l #$80000,a6
cmp.l (sp),a6 ; (sp) = upper limit
blo .testloop
movem.l (sp)+,d1/a4
bra .scan
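; .anaali: write the d6/d7 pattern at the misaligned a6 with plain moves
; and verify it with a movem read, then rewrite it with a movem write and
; verify it again; any mismatch bails out through .use_move.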
.anaali
bsr.b .flush
; write a test bitset
; make sure we actually do read the memory
; instead of the cache
bsr.b .writeset
; read the test bitset back & analyze result
bsr.b .anal
; trash the set...
exg d6,d7
bsr.b .writeset
exg d6,d7
; write the bitset with movem
move.l d6,a0
move.l d7,a1
move.l d6,a2
movem.l d0-d5/a0-a4,(a6)
; make sure we actually do read the memory
; instead of the cache
bsr.b .flush
; read the test bitset back & analyze result
bsr.b .anal ; must not bra!
rts
.writeset move.l d6,(a6) ; d0
move.l d7,(1*4,a6) ; d1
move.l d6,(2*4,a6) ; d2
move.l d7,(3*4,a6) ; d3
move.l d6,(4*4,a6) ; d4
move.l d7,(5*4,a6) ; d5
move.l d6,(6*4,a6) ; a0
move.l d7,(7*4,a6) ; a1
move.l d6,(8*4,a6) ; a2
move.l d7,(9*4,a6) ; a3
move.l d6,(10*4,a6) ; a4
.flush lea (4,a6),a2
lea (4+16,a6),a1
lea (4+32,a6),a0
cpushl dc,(a6)
cpushl dc,(a2)
cpushl dc,(a1)
cpushl dc,(a0)
rts
.anal movem.l (a6),d0-d5/a0-a4
cmp.l d6,d0
bne.b .use_move
cmp.l d7,d1
bne.b .use_move
cmp.l d6,d2
bne.b .use_move
cmp.l d7,d3
bne.b .use_move
cmp.l d6,d4
bne.b .use_move
cmp.l d7,d5
bne.b .use_move
cmp.l d6,a0
bne.b .use_move
cmp.l d7,a1
bne.b .use_move
cmp.l d6,a2
bne.b .use_move
cmp.l d7,a3
bne.b .use_move
cmp.l d6,a4
beq.b .flush
.use_move
addq.l #2*4,sp ; pop return addresses
; align a6 to orig
move.l a6,d0
and.w #-4,d0
move.l d0,a6
; turn off store buffer
move.l #(1<<31)|(1<<15),d0 ; EDC = 1 (Enable Data Cache), ESB = 0 (Disable Store Buffer), EIC = 1 (Enable Instruction Cache)
movec d0,cacr
; restore orig memory
movem.l (sp)+,d0-d5/a0-a5
movem.l d0-d5/a0-a5,(a6)
cpusha dc
addq.l #2*4,sp ; pop stack
; indicate move-mode
moveq #1,d0
dc.w $51FA ; trapf.w: its extension word swallows the following moveq #0,d0, so d0 stays 1
.scandone
moveq #0,d0
movem.l (sp)+,d1-d6/a6
cpusha bc
movec d1,itt0
movec d2,itt1
movec d3,dtt0
movec d4,dtt1
movec d5,cacr
pflusha
movec d6,tc
cpusha bc
cinva bc
.texit
ENDC
nop
rte
.illegal
; oops, this is a 68040
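; rewrite the stacked PC (at offset 2 in the exception frame) so the rte
; resumes at .exitillegal, and leave d0 = 1 to report "not a 68060"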
lea (.exitillegal,pc),a1
moveq #1,d0
move.l a1,(2,sp)
nop
rte
.testend
ELSE
; NOTE: d0.l = 0
IFD _PHXASS_
move.w #COPY_SIZEOF+8+7,d0
ELSE
move.w #(COPY_SIZEOF+8+7)&-8,d0
ENDC
moveq #MEMF_PUBLIC,d1
moveq #8,d2
call AllocMem
and.l d0,d2
add.l d2,d0 ; d0 = address aligned to 16
beq.b .nomem
lea (COPY,pc),a0
move.l d0,a1
move.l d0,a2
move.w #COPY_SIZEOF,d0 ; kind of a hack :-)
bsr.b _smallcopy
move.w #_LVOCopyMem,a0 ; some nice code reuse here...
bsr.b .sfunc
move.w #_LVOCopyMemQuick,a0
.sfunc move.l a2,d0
lea (new_CopyMemQuick-COPY,a2),a2
move.l a6,a1
call SetFunction
moveq #0,d0
rts
ENDC
ENDC
IFD _PHXASS_
CNOP 0,16
ENDC
COPY
new_CopyMem
tst.l d0 ; zero len copy?
beq.b _exit ; yep, quit
move.w a1,d1
btst #0,d1 ; destination aligned by 2?
beq.b .dst_align2 ; yep, try 4
move.b (a0)+,(a1)+ ; do the byte
subq.l #1,d0 ; sub count
beq.b _exit ; was the only byte, quit!
move.w a1,d1
.dst_align2
btst #1,d1 ; destination aligned by 4?
beq.b .dst_align4 ; yep, try big copy
cmpi.l #1,d0
bne.b .not_byte
.copy_byte
move.b (a0)+,(a1)+ ; copy last byte
rts
.not_byte
move.w (a0)+,(a1)+ ; do the word
subq.l #2,d0 ; sub count
beq.b _exit ; was the only word, quit!
.dst_align4
; now destination is longword aligned
cmpi.l #(2048+16),d0 ; worth the trouble?
bcc.b _bigcopy ; yep, do fast copy
_smallcopy
move.w d0,d1
lsr.w #2,d1
beq.b .nolongs
.copy move.l (a0)+,(a1)+
subq.w #1,d1
bne.b .copy
.nolongs
btst #1,d0 ; long copy done, one word left?
beq.b .no_last_word ; nope
move.w (a0)+,(a1)+ ; copy last word
.no_last_word
btst #0,d0 ; one byte left?
beq.b .no_last_byte ; nope
move.b (a0)+,(a1)+ ; copy last byte
.no_last_byte
_exit rts
_bigcopy
IFNE USE_MOVE16
IFNE SAFE_MOVE16
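; with SAFE_MOVE16 the move16 path is skipped whenever either address
; lies in 24-bit address space (below $01000000), e.g. chip RAM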
cmp.l #$01000000,a0
bcs.b .bigcopy_nomove16
move.w a1,d1
cmp.l #$01000000,a1
bcs.b .bigcopy_nomove16
ELSE
move.w a1,d1
ENDC
btst #2,d1 ; destination aligned by 8?
beq.b .dst_align8 ; yep, try 16
move.l (a0)+,(a1)+ ; nope, make it!
addq.w #4,d1
subq.l #4,d0
.dst_align8
btst #3,d1 ; destination aligned by 16?
beq.b .dst_align16 ; yep, try source
move.l (a0)+,(a1)+ ; nope, make it!
move.l (a0)+,(a1)+
subq.l #8,d0
.dst_align16
move.w a0,d1
and.w #15,d1 ; source aligned by 16?
bne.b .bigcopy_nomove16 ; nope, so let's use normal copy
; source: aligned by 16
; destination: aligned by 16
move.l d0,d1
lsr.l #8,d1 ; copy 256 bytes per go
.copy256
REPT 16
move16 (a0)+,(a1)+
ENDR
subq.l #1,d1
bne.b .copy256
and.w #255,d0 ; handle rest if needed
bne _smallcopy
rts
.bigcopy_nomove16
ENDC
IFNE USE_MOVEM
; unrolled movem.l loop, using 12 registers
movem.l d0/d2-d7/a2-a6,-(sp)
lsr.l #8,d0 ; copy 256 bytes per go
.mcopy256
movem.l (a0)+,d1-d7/a2-a6 ; 12*4=48
movem.l d1-d7/a2-a6,(a1)
movem.l (a0)+,d1-d7/a2-a6 ; 12*4=48
movem.l d1-d7/a2-a6,(1*48,a1)
movem.l (a0)+,d1-d7/a2-a6 ; 12*4=48
movem.l d1-d7/a2-a6,(2*48,a1)
movem.l (a0)+,d1-d7/a2-a6 ; 12*4=48
movem.l d1-d7/a2-a6,(3*48,a1)
movem.l (a0)+,d1-d7/a2-a6 ; 12*4=48
movem.l d1-d7/a2-a6,(4*48,a1)
movem.l (a0)+,d1-d4 ; 4*4=16
movem.l d1-d4,(5*48,a1)
subq.l #1,d0
lea (256,a1),a1
bne.b .mcopy256
movem.l (sp)+,d0/d2-d7/a2-a6
and.w #255,d0 ; handle rest if needed
ELSE
; unrolled move.l loop
move.l d0,d1
lsr.l #6,d1 ; copy 64 bytes per go
.mcopy64
REPT 16
move.l (a0)+,(a1)+
ENDR
subq.l #1,d1
bne.b .mcopy64
and.w #63,d0 ; handle rest if needed
ENDC
bne _smallcopy
rts
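; align new_CopyMemQuick to a 16-byte offset within the copied block:
; PhxAss gets a CNOP, other assemblers pad with $51FC (trapf) filler
; words; the block itself is installed at a 16-byte-aligned address.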
IFD _PHXASS_
CNOP 0,16
ELSE
IFGT 16-((*-COPY)&15)
dcb.w (16-((*-COPY)&15))>>1,$51FC
ENDC
ENDC
new_CopyMemQuick
lsr.l #2,d0 ; bytes -> longwords
cmpi.l #(2048+16)>>2,d0 ; worth the trouble?
bcc.b .bigcopy ; yep, do fast copy
tst.w d0
beq.b .exit ; nothing to copy, quit!
.smallcopy
.copy move.l (a0)+,(a1)+
subq.w #1,d0
bne.b .copy
.exit rts
.bigcopy
IFNE USE_MOVE16
IFNE SAFE_MOVE16
cmp.l #$01000000,a0
bcs.b .bigcopy_nomove16
move.w a1,d1
cmp.l #$01000000,a1
bcs.b .bigcopy_nomove16
ELSE
move.w a1,d1
ENDC
btst #2,d1 ; destination aligned by 8?
beq.b .dst_align8 ; yep, try 16
move.l (a0)+,(a1)+ ; nope, make it!
addq.w #4,d1
subq.l #4>>2,d0
.dst_align8
btst #3,d1 ; destination aligned by 16?
beq.b .dst_align16 ; yep, try source
move.l (a0)+,(a1)+ ; nope, make it!
move.l (a0)+,(a1)+
subq.l #8>>2,d0
.dst_align16
move.w a0,d1
and.w #15,d1 ; source aligned by 16?
bne.b .bigcopy_nomove16 ; nope, so let's use normal copy
; source: aligned by 16
; destination: aligned by 16
move.l d0,d1
lsr.l #8-2,d1 ; copy 256 bytes per go
.copy256
REPT 16
move16 (a0)+,(a1)+
ENDR
subq.l #1,d1
bne.b .copy256
and.w #255>>2,d0 ; handle rest if needed
bne .smallcopy
rts
.bigcopy_nomove16
ENDC
IFNE USE_MOVEM
; unrolled movem.l loop, using 12 registers
movem.l d0/d2-d7/a2-a6,-(sp)
lsr.l #8-2,d0 ; copy 256 bytes per go
.mcopy256
movem.l (a0)+,d1-d7/a2-a6 ; 12*4=48
movem.l d1-d7/a2-a6,(a1)
movem.l (a0)+,d1-d7/a2-a6 ; 12*4=48
movem.l d1-d7/a2-a6,(1*48,a1)
movem.l (a0)+,d1-d7/a2-a6 ; 12*4=48
movem.l d1-d7/a2-a6,(2*48,a1)
movem.l (a0)+,d1-d7/a2-a6 ; 12*4=48
movem.l d1-d7/a2-a6,(3*48,a1)
movem.l (a0)+,d1-d7/a2-a6 ; 12*4=48
movem.l d1-d7/a2-a6,(4*48,a1)
movem.l (a0)+,d1-d4 ; 4*4=16
movem.l d1-d4,(5*48,a1)
subq.l #1,d0
lea (256,a1),a1
bne.b .mcopy256
movem.l (sp)+,d0/d2-d7/a2-a6
and.w #255>>2,d0 ; handle rest if needed
ELSE
; unrolled move.l loop
move.l d0,d1
lsr.l #6-2,d1 ; copy 64 bytes per go
.mcopy64
REPT 16
move.l (a0)+,(a1)+
ENDR
subq.l #1,d1
bne.b .mcopy64
and.w #63>>2,d0 ; handle rest if needed
ENDC
bne .smallcopy
rts
COPY_SIZEOF EQU (*-COPY)
IFNE USE_MOVEM&SAFE_MOVEM
IFD _PHXASS_
CNOP 0,16
ENDC
COPY_MOVE
new_CopyMem_MOVE
tst.l d0 ; zero len copy?
beq.b .exit ; yep, quit
move.w a1,d1
btst #0,d1 ; destination aligned by 2?
beq.b .dst_align2 ; yep, try 4
move.b (a0)+,(a1)+ ; do the byte
subq.l #1,d0 ; sub count
beq.b .exit ; was the only byte, quit!
move.w a1,d1
.dst_align2
btst #1,d1 ; destination aligned by 4?
beq.b .dst_align4 ; yep, try big copy
cmpi.l #1,d0
bne.b .not_byte
.copy_byte
move.b (a0)+,(a1)+ ; copy last byte
rts
.not_byte
move.w (a0)+,(a1)+ ; do the word
subq.l #2,d0 ; sub count
beq.b .exit ; was the only word, quit!
.dst_align4
; now destination is longword aligned
cmpi.l #(2048+16),d0 ; worth the trouble?
bcc.b .bigcopy ; yep, do fast copy
.smallcopy
move.w d0,d1
lsr.w #2,d1
beq.b .nolongs
.copy move.l (a0)+,(a1)+
subq.w #1,d1
bne.b .copy
.nolongs
btst #1,d0 ; long copy done, one word left?
beq.b .no_last_word ; nope
move.w (a0)+,(a1)+ ; copy last word
.no_last_word
btst #0,d0 ; one byte left?
beq.b .no_last_byte ; nope
move.b (a0)+,(a1)+ ; copy last byte
.no_last_byte
.exit rts
.bigcopy
IFNE USE_MOVE16
IFNE SAFE_MOVE16
cmp.l #$01000000,a0
bcs.b .bigcopy_nomove16
move.w a1,d1
cmp.l #$01000000,a1
bcs.b .bigcopy_nomove16
ELSE
move.w a1,d1
ENDC
btst #2,d1 ; destination aligned by 8?
beq.b .dst_align8 ; yep, try 16
move.l (a0)+,(a1)+ ; nope, make it!
addq.w #4,d1
subq.l #4,d0
.dst_align8
btst #3,d1 ; destination aligned by 16?
beq.b .dst_align16 ; yep, try source
move.l (a0)+,(a1)+ ; nope, make it!
move.l (a0)+,(a1)+
subq.l #8,d0
.dst_align16
move.w a0,d1
and.w #15,d1 ; source aligned by 16?
bne.b .bigcopy_nomove16 ; nope, so let's use normal copy
; source: aligned by 16
; destination: aligned by 16
move.l d0,d1
lsr.l #8,d1 ; copy 256 bytes per go
.copy256
REPT 16
move16 (a0)+,(a1)+
ENDR
subq.l #1,d1
bne.b .copy256
and.w #255,d0 ; handle rest if needed
bne .smallcopy
rts
.bigcopy_nomove16
ENDC
; unrolled move.l loop
move.l d0,d1
lsr.l #6,d1 ; copy 64 bytes per go
.mcopy64
REPT 16
move.l (a0)+,(a1)+
ENDR
subq.l #1,d1
bne.b .mcopy64
and.w #63,d0 ; handle rest if needed
bne .smallcopy
rts
IFD _PHXASS_
CNOP 0,16
ELSE
IFGT 16-((*-COPY_MOVE)&15)
dcb.w (16-((*-COPY_MOVE)&15))>>1,$51FC
ENDC
ENDC
new_CopyMemQuick_MOVE
lsr.l #2,d0 ; bytes -> longwords
cmpi.l #(2048+16)>>2,d0 ; worth the trouble?
bcc.b .bigcopy ; yep, do fast copy
tst.w d0
beq.b .exit ; nothing to copy, quit!
.smallcopy
.copy move.l (a0)+,(a1)+
subq.w #1,d0
bne.b .copy
.exit rts
.bigcopy
IFNE USE_MOVE16
IFNE SAFE_MOVE16
cmp.l #$01000000,a0
bcs.b .bigcopy_nomove16
move.w a1,d1
cmp.l #$01000000,a1
bcs.b .bigcopy_nomove16
ELSE
move.w a1,d1
ENDC
btst #2,d1 ; destination aligned by 8?
beq.b .dst_align8 ; yep, try 16
move.l (a0)+,(a1)+ ; nope, make it!
addq.w #4,d1
subq.l #4>>2,d0
.dst_align8
btst #3,d1 ; destination aligned by 16?
beq.b .dst_align16 ; yep, try source
move.l (a0)+,(a1)+ ; nope, make it!
move.l (a0)+,(a1)+
subq.l #8>>2,d0
.dst_align16
move.w a0,d1
and.w #15,d1 ; source aligned by 16?
bne.b .bigcopy_nomove16 ; nope, so let's use normal copy
; source: aligned by 16
; destination: aligned by 16
move.l d0,d1
lsr.l #8-2,d1 ; copy 256 bytes per go
.copy256
REPT 16
move16 (a0)+,(a1)+
ENDR
subq.l #1,d1
bne.b .copy256
and.w #255>>2,d0 ; handle rest if needed
bne .smallcopy
rts
.bigcopy_nomove16
ENDC
; unrolled move.l loop
move.l d0,d1
lsr.l #6-2,d1 ; copy 64 bytes per go
.mcopy64
REPT 16
move.l (a0)+,(a1)+
ENDR
subq.l #1,d1
bne.b .mcopy64
and.w #63>>2,d0 ; handle rest if needed
bne .smallcopy
rts
COPY_MOVE_SIZEOF EQU (*-COPY_MOVE)
ENDC
IFNE SPEEDTEST
SECTION C,CODE
*******************************************
* CopyMemQuicker060 *
* Uses Move16 only at addresses >$00ffffff *
* *
* Include File V1.4 *
* written by Dirk Busse *
* 3. Apr. 1999 *
*******************************************
PatchStart
tst.l d0
beq.b squit
move.w a1,d1 ;destination address to d1 (only the lowest two bits are needed)
btst #0,d1
beq.b .skip1
move.b (a0)+,(a1)+ ;make the destination address even
subq.l #1,d0
beq.b squit
move.w a1,d1
.skip1 btst #1,d1
beq.b .skip2
cmpi.l #1,d0
bne.b .two
move.b (a0)+,(a1)+
rts
; cnop 0,4
.two move.w (a0)+,(a1)+ ;make the destination address longword-aligned
subq.l #2,d0
beq.b squit
.skip2
cmpi.l #%0000100000010000,d0 ;check whether d0 >= 2064
bcc.b bigmove
smlmove move.l d0,d1
lsr.w #2,d1 ;d0 is at most 12 bits here
beq.b .nolong
.loop move.l (a0)+,(a1)+
subq.w #1,d1
bne.b .loop
.nolong btst #1,d0
beq.b .skip
move.w (a0)+,(a1)+ ;copy the last word
.skip btst #0,d0
beq.b squit
move.b (a0)+,(a1)+ ;copy the last byte
squit rts
; cnop 0,4
bigmove cmp.l #$01000000,a0 ;is the source address >$00ffffff ?
bcs.b bigmov
cmp.l #$01000000,a1 ;is the destination address >$00ffffff ?
bcs.b bigmov
move.w a1,d1
btst #2,d1
beq.b .skip1
subq.l #%100,d0 ;align the destination to an 8-byte block
move.l (a0)+,(a1)+
move.w a1,d1
.skip1 btst #3,d1
beq.b .dest16
subq.l #%1000,d0 ;align the destination to a 16-byte block
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
.dest16 move.w a0,d1
andi.b #15,d1
bne.b bigmov ;if the source address is not 16-byte aligned
move.l d0,d1
lsr.l #7,d1
.loop move16 (a0)+,(a1)+
move16 (a0)+,(a1)+
move16 (a0)+,(a1)+
move16 (a0)+,(a1)+
move16 (a0)+,(a1)+
move16 (a0)+,(a1)+
move16 (a0)+,(a1)+
move16 (a0)+,(a1)+
subq.l #1,d1
bne.b .loop
and.w #%0000000001111111,d0 ;the upper 16 bits are ignored from here on
bne.b smlmove
rts
; cnop 0,4
bigmov move.l d0,d1
lsr.l #6,d1
.loop move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
subq.l #1,d1
bne.b .loop
and.w #%0000000000111111,d0 ;the upper 16 bits are ignored from here on
bne smlmove
rts
cnop 0,4
**************************************************************************
Quickest
lsr.l #2,d0
cmpi.l #%0000001000000100,d0 ;check whether d0 >= 2064/4
bcc.b bigmo16
smlmovQ tst.w d0 ;d0 is at most 10 bits here
beq.b squitQ
sloopQ move.l (a0)+,(a1)+
subq.w #1,d0
bne.b sloopQ
squitQ rts
; cnop 0,4
bigmo16 cmp.l #$01000000,a0 ;is the source address >$00ffffff ?
bcs.b bigmovQ
cmp.l #$01000000,a1 ;is the destination address >$00ffffff ?
bcs.b bigmovQ
move.w a1,d1
btst #2,d1
beq.b .skip1
subq.l #1,d0 ;align the destination to an 8-byte block
move.l (a0)+,(a1)+
move.w a1,d1
.skip1 btst #3,d1
beq.b .dest16
subq.l #2,d0 ;align the destination to a 16-byte block
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
.dest16 move.w a0,d1
andi.b #15,d1
bne.b bigmovQ ;if the source address is not 16-byte aligned
move.l d0,d1
lsr.l #5,d1
.loop move16 (a0)+,(a1)+
move16 (a0)+,(a1)+
move16 (a0)+,(a1)+
move16 (a0)+,(a1)+
move16 (a0)+,(a1)+
move16 (a0)+,(a1)+
move16 (a0)+,(a1)+
move16 (a0)+,(a1)+
subq.l #1,d1
bne.b .loop
and.w #%0000000000011111,d0 ;the upper 16 bits are ignored from here on
bne.b sloopQ
rts
; cnop 0,4
bigmovQ move.l d0,d1
lsr.l #4,d1
.loop move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
move.l (a0)+,(a1)+
subq.l #1,d1
bne.b .loop
and.w #%0000000000001111,d0 ;the upper 16 bits are ignored from here on
bne sloopQ
rts
cnop 0,4
PatchEnd
ENDC