home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
AmigActive 13
/
AACD13.ISO
/
AACD
/
Sound
/
LAME
/
src
/
i386
/
fft3dn.nas
< prev
next >
Wrap
Text File
|
2000-01-30
|
12KB
|
673 lines
; back port from GOGO-no coda 2.24b by Takehiro TOMINAGA
; GOGO-no-coda
; Copyright (C) 1999 shigeo
; special thanks to URURI
%include "nasm.h"
externdef costab_fft
externdef sintab_fft
; ---------------------------------------------------------------------
; Read-only constants used by the routines below.
; Each entry is a pair of dwords so it can be loaded into one 64-bit MMX
; register; the first dd is the lower-addressed dword.  Lane comments in
; this file are written "high | low".
; Entries not referenced in the visible part of this file (D_1_0,
; D_0_02236, D_0_0005, D_0_0, D_MSB1_1) are kept as-is.
; ---------------------------------------------------------------------
segment_data
align 32
; 1.41421356 in both lanes; scales the g2/g3 terms in the first butterfly pass
D_1_41421 dd 1.41421356 , 1.41421356
D_1_0 dd 1.0 , 1.0
; 0.5 and 0.25: final weights applied in fft_side_3DN / fft_side_E3DN
D_0_5 dd 0.5 , 0.5
D_0_25 dd 0.25 , 0.25
D_0_02236 dd 0.02236067 , 0.02236067
D_0_0005 dd 0.0005 , 0.0005
D_0_0 dd 0.0 , 0.0
; initial twiddle pair loaded into mm6 as "c1 | s1": high = 1.0, low = 0.0
D_1_0_D_0_0 dd 0.0 , 1.0
; initial twiddle pair loaded into mm7 as "-s1 | c1": high = 0.0, low = 1.0
D_0_0_D_1_0 dd 1.0 , 0.0
; sign-bit masks for pxor: MSB set in the high dword only, in both dwords,
; and in the low dword only, respectively
D_MSB1_0 dd 0x00000000 , 0x80000000
D_MSB1_1 dd 0x80000000 , 0x80000000
D_MSB0_1 dd 0x80000000 , 0x00000000
; everything after this point is code
segment_code
;-----------------------------------------------------------------------
; void fht_3DN2(float *fz, int n);
;
; In-place butterfly passes over the float array fz[] using 3DNow!.
; (Presumably a fast Hartley transform, judging by the name -- confirm
; against the C reference implementation.)
;
; Stack args : fz = pointer to the data, n = length
;              NOTE(review): n is taken to be a count of floats, which is
;              what the "k4 < n" test at the end of each pass implies.
; Returns    : nothing; fz[] is rewritten in place.
; Preserves  : ebp, ebx, esi, edi (pushd/popd).  femms is issued on entry
;              and on exit.
;
; Register roles (rN, sp(), pmov, pmovd, pupldq, puphdq come from nasm.h):
;   r0 = fi            r1 = gi            r2 = k3*fsize
;   r3 = k1*fsize      r4 = kx*fsize      r5 = i*fsize
;   r6 = fn, one past the last float of fz[]
;
; Fix: r6 is the bound tested by "cmp r0, r6" in .do2 and .do3, but the
; procedure never loaded it, so the inner loops ran against whatever the
; caller had left in that register.  It is now set to fz + n*fsize in
; .do_init, before the first pass.
;-----------------------------------------------------------------------
proc fht_3DN2
%$fz            arg     4
%$n             arg     4
%$k             local   4
%$Ps2_Pc2       local   8               ; s2 | c2
%$Mc2_Ps2       local   8               ; -c2 | s2
%$t_s           local   8               ; sintab_fft[k] in both lanes
%$t_c           local   8               ; costab_fft[k] in both lanes
        alloc
        femms
        pushd   ebp, ebx, esi, edi

; These three labels carry no code in this back port; kept for anyone
; referencing them.
fht_3DN_1st_part:
fht_3DN_2nd_part:
fht_3DN_3rd_part:

.do_init:
        mov     r3, 16                  ;k1*fsize = 4*fsize = k4
        mov     r4, 8                   ;kx = k1/2
        mov     r2, 48                  ;k3*fsize
        mov     dword [sp(%$k)], 2      ;k = 2
        mov     r0, [sp(%$fz)]          ;fi
        mov     r6, [sp(%$n)]
        lea     r6, [r0+r6*4]           ;fn = fz + n (loop bound, was never set)
        lea     r1, [r0+8]              ;gi = fi + kx
        jmp     .do

        align 16
; ---- start of one pass: reload the constants used by .do2 ------------
.do:
        pmov    mm6, [D_MSB1_0]         ;MSB1_0
        pmov    mm7, [D_1_41421]

; ---- .do2: butterflies on fi[0,k1,k2,k3] and gi[0,k1,k2,k3] ----------
.do2:
        ;f
        pmov    mm1, [r0+r3]            ;fi1
        pmov    mm4, [r0+r2]            ;fi3
        pmov    mm0, [r0]               ;fi0
        pmov    mm3, [r0+r3*2]          ;fi2
        pupldq  mm1, mm1
        pupldq  mm4, mm4
        pupldq  mm0, mm0                ;fi0 | fi0
        pupldq  mm3, mm3                ;fi2 | fi2
        pxor    mm1, mm6                ;-fi1 | fi1
        pxor    mm4, mm6                ;-fi3 | fi3
        pfadd   mm0, mm1                ;f1 | f0
        pfadd   mm3, mm4                ;f3 | f2
        pmov    mm4, mm0
        pfadd   mm0, mm3                ;fi1 | fi0
        pfsub   mm4, mm3                ;fi3 | fi2
        pmovd   [r0], mm0               ;fi[0]
        puphdq  mm0, mm0
        pmovd   [r0+r3*2], mm4          ;fi[k2]
        puphdq  mm4, mm4
        pmovd   [r0+r3], mm0            ;fi[k1]
        pmovd   [r0+r2], mm4            ;fi[k3]
        ;g
        pmov    mm1, [r1+r3]            ;gi1
        pmov    mm0, [r1]               ;gi0
        pmov    mm3, [r1+r3*2]          ;gi2
        pmov    mm5, [r1+r2]            ;gi3
        pupldq  mm1, mm1
        pupldq  mm0, mm0                ;gi0 | gi0
        pupldq  mm3, mm5                ;gi3 | gi2
        pxor    mm1, mm6                ;-gi1 | gi1
        pfadd   mm0, mm1                ;g1 | g0
        pfmul   mm3, mm7                ;g3 | g2
        pmov    mm4, mm0
        pfadd   mm0, mm3                ;gi1 | gi0
        pfsub   mm4, mm3                ;gi3 | gi2
        pmovd   [r1], mm0               ;gi[0]
        puphdq  mm0, mm0
        pmovd   [r1+r3*2], mm4          ;gi[k2]
        puphdq  mm4, mm4
        pmovd   [r1+r3], mm0            ;gi[k1]
        pmovd   [r1+r2], mm4            ;gi[k3]
        lea     r0, [r0+r3*4]           ;fi += 4*k1
        lea     r1, [r1+r3*4]           ;gi += 4*k1
        cmp     r0, r6                  ;while (fi < fn)
        jb near .do2

; ---- load the per-pass rotation step and reset (c1, s1) --------------
        mov     r0, [sp(%$k)]
        pmov    mm0, [costab_fft +r0*4]
        pmov    mm1, [sintab_fft +r0*4]
        pupldq  mm0, mm0
        pupldq  mm1, mm1
        pmov    mm6, [D_1_0_D_0_0]      ;c1 | s1
        pmov    mm7, [D_0_0_D_1_0]      ;-s1 | c1
        pmov    [sp(%$t_c)], mm0
        pmov    [sp(%$t_s)], mm1

.for_init:
        mov     r5, 4                   ;i = 1*fsize
        jmp     .for

        align 16
; ---- .for: advance (c1, s1) by (t_c, t_s), derive (c2, s2) -----------
.for:
        pfmul   mm6, [sp(%$t_c)]        ;c1*t_c | s1*t_c
        pfmul   mm7, [sp(%$t_s)]        ;-s1*t_s | c1*t_s
        pfadd   mm6, mm7                ;c1 | s1
        pmov    mm7, [D_MSB0_1]
        pmov    mm1, mm6
        pxor    mm7, mm6                ;c1 | -s1
        puphdq  mm1, mm1                ;c1
        pmov    mm0, mm7
        pupldq  mm2, mm7
        pfmul   mm0, mm6                ;c1*c1 | -s1*s1
        pfmul   mm1, mm6                ;c1*s1
        puphdq  mm7, mm2                ;-s1 | c1
        pfacc   mm0, mm0                ;c2
        pfadd   mm1, mm1                ;s2 = 2*c1*s1
        pupldq  mm1, mm0                ;c2 | s2
        pupldq  mm0, mm1                ;s2 | c2
        pxor    mm1, [D_MSB1_0]         ;-c2 | s2
        pmov    [sp(%$Ps2_Pc2)], mm0
        pmov    [sp(%$Mc2_Ps2)], mm1
        mov     r0, [sp(%$fz)]
        mov     r1, [sp(%$fz)]
        add     r0, r5                  ;r0 = fi
        add     r1, r3
        sub     r1, r5                  ;r1 = gi
        jmp     .do3

        align 16
; ---- .do3: rotated butterflies for the current i ---------------------
.do3:
        pmov    mm2, [r0+r3]
        pmov    mm4, [r1+r3]
        pmov    mm3, [r0+r2]
        pmov    mm5, [r1+r2]
        pupldq  mm2, mm2
        pupldq  mm4, mm4
        pupldq  mm3, mm3
        pupldq  mm5, mm5
        pmov    mm0, [sp(%$Ps2_Pc2)]
        pmov    mm1, [sp(%$Mc2_Ps2)]
        pfmul   mm2, mm0                ;s2 * fi1 | c2 * fi1
        pfmul   mm4, mm1                ;-c2 * gi1 | s2 * gi1
        pfmul   mm3, mm0                ;s2 * fi3 | c2 * fi3
        pfmul   mm5, mm1                ;-c2 * gi3 | s2 * gi3
        pfadd   mm2, mm4                ;b | a
        pfadd   mm3, mm5                ;d | c
        pmov    mm0, [r0]
        pmov    mm4, [r1]
        pmov    mm1, [r0+r3*2]
        pmov    mm5, [r1+r3*2]
        pupldq  mm0, mm4                ;gi0 | fi0
        pupldq  mm1, mm5                ;gi2 | fi2
        pmov    mm4, mm2
        pmov    mm5, mm3
        pfadd   mm2, mm0                ;g0 | f0
        pfadd   mm3, mm1                ;g2 | f2
        pfsub   mm0, mm4                ;g1 | f1
        pfsub   mm1, mm5                ;g3 | f3
        pmov    mm4, mm3
        pmov    mm5, mm1
        pupldq  mm4, mm4                ;f2 | f2
        puphdq  mm5, mm5                ;g3 | g3
        puphdq  mm3, mm3                ;g2 | g2
        pupldq  mm1, mm1                ;f3 | f3
        pfmul   mm4, mm6                ;f2 * c1 | f2 * s1
        pfmul   mm5, mm7                ;g3 * -s1 | g3 * c1
        pfmul   mm3, mm6                ;g2 * c1 | g2 * s1
        pfmul   mm1, mm7                ;f3 * -s1 | f3 * c1
        pfsub   mm4, mm5                ;a | b
        pfadd   mm3, mm1                ;d | c
        pmov    mm5, mm2
        pmov    mm1, mm0
        pupldq  mm2, mm2                ;f0 | f0
        pupldq  mm0, mm0                ;f1 | f1
        puphdq  mm1, mm2                ;f0 | g1
        puphdq  mm5, mm0                ;f1 | g0
        pmov    mm2, mm4
        pmov    mm0, mm3
        pfadd   mm4, mm1                ;fi0 | gi1
        pfadd   mm3, mm5                ;fi1 | gi0
        pfsub   mm1, mm2                ;fi2 | gi3
        pfsub   mm5, mm0                ;fi3 | gi2
        pmovd   [r1+r3], mm4            ;gi[k1]
        puphdq  mm4, mm4
        pmovd   [r1], mm3               ;gi[0]
        puphdq  mm3, mm3
        pmovd   [r1+r2], mm1            ;gi[k3]
        puphdq  mm1, mm1
        pmovd   [r1+r3*2], mm5          ;gi[k2]
        puphdq  mm5, mm5
        pmovd   [r0], mm4               ;fi[0]
        pmovd   [r0+r3], mm3            ;fi[k1]
        pmovd   [r0+r3*2], mm1          ;fi[k2]
        pmovd   [r0+r2], mm5            ;fi[k3]
        lea     r0, [r0+r3*4]           ;fi += 4*k1
        lea     r1, [r1+r3*4]           ;gi += 4*k1
        cmp     r0, r6                  ;while (fi < fn)
        jb near .do3

        add     r5, 4                   ;next i
        cmp     r5, r4                  ;while (i < kx)
        jb near .for

; ---- next pass: scale the strides by 4, or finish --------------------
        cmp     r3, [sp(%$n)]
        jae     .exit
        add     dword [sp(%$k)], 2      ;k += 2;
        lea     r3, [r3*4]              ;k1 *= 4
        lea     r2, [r2*4]              ;k3 *= 4
        lea     r4, [r4*4]              ;kx *= 4
        mov     r0, [sp(%$fz)]          ;fi
        lea     r1, [r0+r4]             ;gi = fi + kx
        jmp     .do

.exit:
        femms
        popd    ebp, ebx, esi, edi
endproc
;***********************************************************************
%ifdef USE_E3DN
;-----------------------------------------------------------------------
; void fht_E3DN(float *fz, int n);
;
; Enhanced-3DNow! variant (uses pfpnacc / pswapd) of the in-place
; butterfly passes over the float array fz[].
; (Presumably a fast Hartley transform, judging by the name -- confirm
; against the C reference implementation.)
;
; Stack args : fz = pointer to the data, n = length
;              NOTE(review): n is taken to be a count of floats, which is
;              what the "k4 < n" test at the end of each pass implies.
; Returns    : nothing; fz[] is rewritten in place.
; Preserves  : ebp, ebx, esi, edi (pushd/popd).  femms is issued on entry
;              and on exit.
;
; Register roles (rN, sp(), pmov, pmovd, pupldq, puphdq come from nasm.h):
;   r0 = fi            r1 = gi            r2 = k3*fsize
;   r3 = k1*fsize      r4 = kx*fsize      r5 = i*fsize
;   r6 = fn, one past the last float of fz[]
;
; Fix: r6 is the bound tested by "cmp r0, r6" in .do2 and .do3, but the
; procedure never loaded it, so the inner loops ran against whatever the
; caller had left in that register.  It is now set to fz + n*fsize in
; .do_init, before the first pass.
;-----------------------------------------------------------------------
proc fht_E3DN
%$fz            arg     4
%$n             arg     4
%$k             local   4
%$Ps2_Pc2       local   8               ; s2 | c2
%$Mc2_Ps2       local   8               ; -c2 | s2
%$t_s           local   8               ; sintab_fft[k] in both lanes
%$t_c           local   8               ; costab_fft[k] in both lanes
        alloc
        femms
        pushd   ebp, ebx, esi, edi

; These three labels carry no code in this back port; kept for anyone
; referencing them.
fht_E3DN_1st_part:
fht_E3DN_2nd_part:
fht_E3DN_3rd_part:

.do_init:
        mov     r3, 16                  ;k1*fsize = 4*fsize = k4
        mov     r4, 8                   ;kx = k1/2
        mov     r2, 48                  ;k3*fsize
        mov     dword [sp(%$k)], 2      ;k = 2
        mov     r0, [sp(%$fz)]          ;fi
        mov     r6, [sp(%$n)]
        lea     r6, [r0+r6*4]           ;fn = fz + n (loop bound, was never set)
        lea     r1, [r0+8]              ;gi = fi + kx
        jmp     .do

        align 16
; ---- start of one pass: reload the constant used by .do2 -------------
.do:
        pmov    mm7, [D_1_41421]

; ---- .do2: butterflies on fi[0,k1,k2,k3] and gi[0,k1,k2,k3] ----------
.do2:
        pmov    mm0, [r0]               ;fi0
        pupldq  mm0, [r0+r3]            ;fi1 | fi0
        pmov    mm1, [r0+r3*2]          ;fi2
        pupldq  mm1, [r0+r2]            ;fi3 | fi2
        pmov    mm3, [r1]               ;gi0
        pupldq  mm3, [r1+r3]            ;gi1 | gi0
        pmov    mm4, [r1+r2]            ;gi3
        pupldq  mm4, [r1+r3*2]          ;gi2 | gi3
        pfpnacc mm0, mm0                ;f0 | f1
        pfpnacc mm1, mm1                ;f2 | f3
        pfpnacc mm3, mm3                ;g0 | g1
        pfmul   mm4, mm7                ;g2 | g3
        pmov    mm2, mm0
        pfadd   mm0, mm1                ;fi0 | fi1
        pfsub   mm2, mm1                ;fi2 | fi3
        pmov    mm5, mm3
        pfadd   mm3, mm4                ;gi0 | gi1
        pfsub   mm5, mm4                ;gi2 | gi3
        pmovd   [r0+r3], mm0            ;fi[k1]
        puphdq  mm0, mm0
        pmovd   [r0+r2], mm2            ;fi[k3]
        puphdq  mm2, mm2
        pmovd   [r1+r3], mm3            ;gi[k1]
        puphdq  mm3, mm3
        pmovd   [r1+r2], mm5            ;gi[k3]
        puphdq  mm5, mm5
        pmovd   [r0], mm0               ;fi[0]
        pmovd   [r0+r3*2], mm2          ;fi[k2]
        pmovd   [r1], mm3               ;gi[0]
        pmovd   [r1+r3*2], mm5          ;gi[k2]
        lea     r0, [r0+r3*4]           ;fi += 4*k1
        lea     r1, [r1+r3*4]           ;gi += 4*k1
        cmp     r0, r6                  ;while (fi < fn)
        jb near .do2

; ---- load the per-pass rotation step and reset (c1, s1) --------------
        mov     r0, [sp(%$k)]
        pmov    mm0, [costab_fft +r0*4]
        pmov    mm1, [sintab_fft +r0*4]
        pupldq  mm0, mm0
        pupldq  mm1, mm1
        pmov    mm6, [D_1_0_D_0_0]      ;c1 | s1
        pmov    mm7, [D_0_0_D_1_0]      ;-s1 | c1
        pmov    [sp(%$t_c)], mm0
        pmov    [sp(%$t_s)], mm1

.for_init:
        mov     r5, 4                   ;i = 1*fsize
        jmp     .for

        align 16
; ---- .for: advance (c1, s1) by (t_c, t_s), derive (c2, s2) -----------
.for:
        pfmul   mm6, [sp(%$t_c)]        ;c1*t_c | s1*t_c
        pfmul   mm7, [sp(%$t_s)]        ;-s1*t_s | c1*t_s
        pfadd   mm6, mm7                ;c1 | s1
        pmov    mm7, [D_MSB0_1]
        pswapd  mm1, mm6                ;s1 | c1
        pswapd  mm0, mm6
        pxor    mm7, mm6                ;c1 | -s1
        pfmul   mm1, mm6                ;c1*s1 | c1*s1
        pfmul   mm0, mm0                ;s1*s1 | c1*c1
        pswapd  mm7, mm7                ;-s1 | c1
        pfpnacc mm0, mm1                ;s2 = 2*c1*s1 | c2 = c1*c1-s1*s1
        pswapd  mm1, mm0                ;c2 | s2
        pxor    mm1, [D_MSB1_0]         ;-c2 | s2
        pmov    [sp(%$Ps2_Pc2)], mm0
        pmov    [sp(%$Mc2_Ps2)], mm1
        mov     r0, [sp(%$fz)]
        mov     r1, [sp(%$fz)]
        add     r0, r5                  ;r0 = fi
        add     r1, r3
        sub     r1, r5                  ;r1 = gi
        jmp     .do3

        align 16
; ---- .do3: rotated butterflies for the current i ---------------------
.do3:
        pmov    mm0, [r0+r2]
        pmov    mm2, [r1+r2]
        pmov    mm1, [r0+r3]
        pmov    mm3, [r1+r3]
        pupldq  mm0, mm0
        pupldq  mm2, mm2
        pupldq  mm1, mm1
        pupldq  mm3, mm3
        pmov    mm4, [sp(%$Ps2_Pc2)]
        pmov    mm5, [sp(%$Mc2_Ps2)]
        pfmul   mm0, mm4                ;s2 * fi3 | c2 * fi3
        pfmul   mm2, mm5                ;-c2 * gi3 | s2 * gi3
        pfmul   mm1, mm4                ;s2 * fi1 | c2 * fi1
        pfmul   mm3, mm5                ;-c2 * gi1 | s2 * gi1
        pfadd   mm0, mm2                ;d | c
        pfadd   mm1, mm3                ;b | a
        pmov    mm2, [r0+r3*2]          ;fi2
        pupldq  mm3, [r1+r3*2]          ;gi2 | -
        pmov    mm4, [r0]               ;fi0
        pupldq  mm5, [r1]               ;gi0 | -
        pupldq  mm2, mm0                ;c | fi2
        puphdq  mm3, mm0                ;d | gi2
        pupldq  mm4, mm1                ;a | fi0
        puphdq  mm5, mm1                ;b | gi0
        pfpnacc mm2, mm2                ;f2 | f3
        pfpnacc mm3, mm3                ;g2 | g3
        pfpnacc mm4, mm4                ;f0 | f1
        pfpnacc mm5, mm5                ;g0 | g1
        pmov    mm0, mm2
        pmov    mm1, mm3
        pupldq  mm2, mm2                ;f3 | f3
        pupldq  mm3, mm3                ;g3 | g3
        puphdq  mm0, mm0                ;f2 | f2
        puphdq  mm1, mm1                ;g2 | g2
        pswapd  mm4, mm4                ;f1 | f0
        pswapd  mm5, mm5                ;g1 | g0
        pfmul   mm0, mm7                ;f2 *-s1 | f2 * c1
        pfmul   mm3, mm6                ;g3 * c1 | g3 * s1
        pfmul   mm1, mm6                ;g2 * c1 | g2 * s1
        pfmul   mm2, mm7                ;f3 *-s1 | f3 * c1
        pfadd   mm0, mm3                ;-b | a
        pfadd   mm1, mm2                ; d | c
        pmov    mm2, mm5
        pmov    mm3, mm4
        pupldq  mm4, mm0                ; a | f0
        pupldq  mm5, mm1                ; c | g0
        puphdq  mm2, mm0                ;-b | g1
        puphdq  mm3, mm1                ; d | f1
        pfpnacc mm4, mm4                ;fi0 | fi2
        pfpnacc mm5, mm5                ;gi0 | gi2
        pfpnacc mm2, mm2                ;gi3 | gi1
        pfpnacc mm3, mm3                ;fi1 | fi3
        pmovd   [r0+r3*2], mm4          ;fi[k2]
        puphdq  mm4, mm4
        pmovd   [r1+r3*2], mm5          ;gi[k2]
        puphdq  mm5, mm5
        pmovd   [r1+r3], mm2            ;gi[k1]
        puphdq  mm2, mm2
        pmovd   [r0+r2], mm3            ;fi[k3]
        puphdq  mm3, mm3
        pmovd   [r0], mm4               ;fi[0]
        pmovd   [r1], mm5               ;gi[0]
        pmovd   [r1+r2], mm2            ;gi[k3]
        pmovd   [r0+r3], mm3            ;fi[k1]
        lea     r0, [r0+r3*4]           ;fi += 4*k1
        lea     r1, [r1+r3*4]           ;gi += 4*k1
        cmp     r0, r6                  ;while (fi < fn)
        jb near .do3

        add     r5, 4                   ;next i
        cmp     r5, r4                  ;while (i < kx)
        jb near .for

; ---- next pass: scale the strides by 4, or finish --------------------
        cmp     r3, [sp(%$n)]
        jae     .exit
        add     dword [sp(%$k)], 2      ;k += 2;
        lea     r3, [r3*4]              ;k1 *= 4
        lea     r2, [r2*4]              ;k3 *= 4
        lea     r4, [r4*4]              ;kx *= 4
        mov     r0, [sp(%$fz)]          ;fi
        lea     r1, [r0+r4]             ;gi = fi + kx
        jmp     .do

.exit:
        femms
        popd    ebp, ebx, esi, edi
endproc
%endif
;***********************************************************************
;-----------------------------------------------------------------------
; void fft_side_3DN(float in[2][1024], int s, float *ret);
;   s = MSFREQ is assumed to be a multiple of 4 (it was 20 when this
;   routine was written).
;
; Accumulates squared differences in[0][x] - in[1][x], walking one index
; upwards from s and a mirrored index downwards from 1023-s, four floats
; from each side per iteration, until the upward byte offset reaches
; 512*fsize.  The value stored to *ret is
;     0.25 * (sum of both accumulator lanes)
;   + 0.5  * (in[0][k] - in[1][k])^2
; where k*fsize is r2 at loop exit.  The loop body always runs at least
; once.
;
; Register roles (rN, sp(), fsizen(), pmov, ... come from nasm.h):
;   r0 = &in[0][0]     r1 = &in[1][0]
;   r2 = ascending byte offset     r3 = descending byte offset
;   r4 = 512*fsize (loop limit)    mm7 = running sum (two lanes)
; Preserves ebx, esi (pushd/popd); femms on entry and exit.
;-----------------------------------------------------------------------
proc fft_side_3DN
%$in            arg     4
%$s             arg     4
%$pret          arg     4
        femms
        pushd   ebx, esi

.for_init:
        mov     r0, [sp(%$in)]                  ; r0 = &in[0][0]
        mov     r2, [sp(%$s)]
        shl     r2, 2                           ; r2 = s * fsize
        mov     r3, fsizen(1023)
        sub     r3, r2                          ; r3 = (1023-s) * fsize
        lea     r1, [r0+fsizen(1024)]           ; r1 = &in[1][0]
        mov     r4, fsizen(512)                 ; r4 = 512 * fsize
        pxor    mm7, mm7                        ; sum = 0 | 0
        jmp     .for

        align 16
.for:
        ; --- mirrored pair at r3: difference, square, swap the lanes ---
        pmov    mm0, [r0+r3]
        pmov    mm1, [r1+r3]
        pfsub   mm0, mm1
        pfmul   mm0, mm0
        pupldq  mm6, mm0                        ; park low lane in mm6 high
        puphdq  mm0, mm6                        ; mm0 = old low | old high

        ; --- pair at r2: difference, square ---
        pmov    mm2, [r0+r2]
        pmov    mm3, [r1+r2]
        pfsub   mm2, mm3
        pfmul   mm2, mm2

        pfadd   mm7, mm2                        ; sum += ascending term
        pfadd   mm7, mm0                        ; sum += mirrored term

        ; --- mirrored pair two floats lower: same treatment ---
        pmov    mm4, [r0+r3-fsizen(2)]
        pmov    mm5, [r1+r3-fsizen(2)]
        pfsub   mm4, mm5
        pfmul   mm4, mm4
        pupldq  mm6, mm4
        puphdq  mm4, mm6                        ; lanes swapped

        ; --- pair two floats higher ---
        pmov    mm1, [r0+r2+fsizen(2)]
        pmov    mm3, [r1+r2+fsizen(2)]
        pfsub   mm1, mm3
        pfmul   mm1, mm1

        pfadd   mm7, mm1
        pfadd   mm7, mm4

        ; --- step both offsets by four floats and test the limit ---
        add     r2, fsizen(4)
        sub     r3, fsizen(4)
        cmp     r2, r4
        jb      .for

        ; --- centre term and final weighting ---
        pmov    mm0, [r0+r2]
        pmov    mm1, [r1+r2]
        pfsub   mm0, mm1
        pfmul   mm0, mm0
        pmov    mm3, [D_0_5]
        pfmul   mm0, mm3                        ; 0.5 * centre^2
        pfacc   mm7, mm7                        ; add the two lanes
        pmov    mm2, [D_0_25]
        pfmul   mm7, mm2                        ; 0.25 * sum
        pfadd   mm0, mm7
        mov     r0, [sp(%$pret)]
        pmovd   [r0], mm0                       ; *ret = low lane

.exit:
        femms
        popd    ebx, esi
endproc
;***********************************************************************
%ifdef USE_E3DN
;-----------------------------------------------------------------------
; void fft_side_E3DN(float in[2][1024], int s, float *ret);
;   s = MSFREQ is assumed to be a multiple of 4 (it was 20 when this
;   routine was written).
;
; Enhanced-3DNow! variant (uses pswapd).  Accumulates squared differences
; in[0][x] - in[1][x], walking one index upwards from s and a mirrored
; index downwards from 1023-s, four floats from each side per iteration,
; until the upward byte offset reaches 512*fsize.  The value stored to
; *ret is
;     0.25 * (sum of both accumulator lanes)
;   + 0.5  * (in[0][k] - in[1][k])^2
; where k*fsize is r2 at loop exit.  The loop body always runs at least
; once.
;
; Register roles (rN, sp(), fsizen(), pmov, ... come from nasm.h):
;   r0 = &in[0][0]     r1 = &in[1][0]
;   r2 = ascending byte offset     r3 = descending byte offset
;   r4 = 512*fsize (loop limit)    mm7 = running sum (two lanes)
; Preserves ebx, esi (pushd/popd); femms on entry and exit.
;-----------------------------------------------------------------------
proc fft_side_E3DN
%$in            arg     4
%$s             arg     4
%$pret          arg     4
        femms
        pushd   ebx, esi

.for_init:
        mov     r0, [sp(%$in)]                  ; r0 = &in[0][0]
        mov     r2, [sp(%$s)]
        shl     r2, 2                           ; r2 = s * fsize
        mov     r3, fsizen(1023)
        sub     r3, r2                          ; r3 = (1023-s) * fsize
        lea     r1, [r0+fsizen(1024)]           ; r1 = &in[1][0]
        mov     r4, fsizen(512)                 ; r4 = 512 * fsize
        pxor    mm7, mm7                        ; sum = 0 | 0
        jmp     .for

        align 16
.for:
        ; --- mirrored pair at r3: difference, square, swap the lanes ---
        pmov    mm0, [r0+r3]
        pfsub   mm0, [r1+r3]
        pfmul   mm0, mm0
        pswapd  mm0, mm0

        ; --- mirrored pair two floats lower: same treatment ---
        pmov    mm4, [r0+r3-fsizen(2)]
        pfsub   mm4, [r1+r3-fsizen(2)]
        pfmul   mm4, mm4
        pswapd  mm4, mm4

        ; --- pair at r2: difference, square ---
        pmov    mm2, [r0+r2]
        pfsub   mm2, [r1+r2]
        pfmul   mm2, mm2

        ; --- pair two floats higher ---
        pmov    mm1, [r0+r2+fsizen(2)]
        pfsub   mm1, [r1+r2+fsizen(2)]
        pfmul   mm1, mm1

        ; --- accumulate: ascending terms first, then mirrored terms ---
        pfadd   mm7, mm2
        pfadd   mm7, mm1
        pfadd   mm7, mm0
        pfadd   mm7, mm4

        ; --- step both offsets by four floats and test the limit ---
        add     r2, fsizen(4)
        sub     r3, fsizen(4)
        cmp     r2, r4
        jb      .for

        ; --- centre term and final weighting ---
        pmov    mm0, [r0+r2]
        pfsub   mm0, [r1+r2]
        pfmul   mm0, mm0
        pfmul   mm0, [D_0_5]                    ; 0.5 * centre^2
        pfacc   mm7, mm7                        ; add the two lanes
        pfmul   mm7, [D_0_25]                   ; 0.25 * sum
        pfadd   mm0, mm7
        mov     r0, [sp(%$pret)]
        pmovd   [r0], mm0                       ; *ret = low lane

.exit:
        femms
        popd    ebx, esi
endproc
%endif