blob: 337de56c39468f1e6bdce0fd07afa9209b875a6a [file] [log] [blame]
;// file : mmx_zoom.s
;// author : JC Hoelt <jeko@free.fr>
;//
;// history
;// 07/01/2001 : Changed FEMMS to EMMS : slower... but runs on Intel machines
;// 03/01/2001 : WIDTH and HEIGHT are now variable
;// 28/12/2000 : added comments to the code, removed some useless lines
;// 27/12/2000 : reducing memory access... improving performance by 20%
;// coefficients are now on 1 byte
;// 22/12/2000 : Changing data structure
;// 16/12/2000 : AT&T version
;// 14/12/2000 : unrolling loop
;// 12/12/2000 : 64 bits memory access
.data
;// thezero: 64-bit all-zero constant, stored as two consecutive 32-bit words
;// so that it can be fetched with a single 8-byte (movq) load.
thezero:
.long 0x00000000 /* bytes 0-3 (low half of the quadword) */
.long 0x00000000 /* bytes 4-7 (high half of the quadword) */
;// Code section: exported entry point and the external symbols it reads.
.text
.globl mmx_zoom ;// exported symbol: the function called by the C program
.extern coeffs ;// the transformation buffer
.extern expix1,expix2 ;// the source and destination buffers
.extern mmx_zoom_size, zoom_width ;// sizes of the buffers
;// alignment request applied to the label that follows
.align 16
;//-----------------------------------------------------------------------
;// mmx_zoom
;//
;// For each of mmx_zoom_size output pixels, blend four 4-byte source
;// pixels with four 8-bit weights and store one 4-byte result pixel.
;//
;// ABI: 32-bit x86, called from C. Takes nothing from the stack and sets
;// no return value; all inputs are read from global variables.
;// NOTE(review): presumably declared as void mmx_zoom(void) on the C
;// side - confirm against the caller.
;//
;// Globals read:
;//   coeffs        - pointer to 8-byte records, one per output pixel:
;//                     bytes 0-3  byte offset of the first source pixel,
;//                                relative to the address held in expix1
;//                     bytes 4-7  weights c1,c2,c3,c4 (c1 in the lowest byte)
;//   expix1        - pointer to the source pixels
;//   expix2        - pointer to the destination pixels
;//   zoom_width    - multiplied by 4 to get the byte distance from the
;//                   first source pixel pair to the second pair
;//                   (presumably the row width in pixels - confirm)
;//   mmx_zoom_size - number of output pixels; 0 returns immediately
;//
;// Per output pixel, with p0/p1 the two pixels at the source address and
;// p2/p3 the two pixels zoom_width*4 bytes further on, each 8-bit channel is
;//   out = saturate8((p0*c1 + p1*c2 + p2*c3 + p3*c4) >> 8)
;// NOTE(review): the 16-bit sums wrap on overflow (paddw); presumably the
;// four weights never sum to more than 256 - confirm with the code that
;// fills coeffs.
;//
;// Register roles inside the loop:
;//   eax = current coeffs record     ebx = current destination pixel
;//   ecx = pixels still to write     edx = source base address
;//   ebp = zoom_width * 4            esi = address of p0/p1
;//   mm7 = 0 (used for byte->word unpacking and for the final pack)
;//
;// Clobbers: eax, ecx, edx, flags, mm0-mm7 (MMX state is released with
;// emms). ebx, esi and ebp are saved and restored; edi is not touched.
;//-----------------------------------------------------------------------
mmx_zoom:
        movl    mmx_zoom_size, %ecx     /* ecx = number of pixels to produce */
        testl   %ecx, %ecx
        jz      .Lzoom_ret              /* nothing to do: avoid counter wrap-around */

        pushl   %ebp                    /* callee-saved registers used below */
        pushl   %ebx
        pushl   %esi

        pxor    %mm7, %mm7              /* mm7 = 0 */
        movl    zoom_width, %ebp
        shll    $2, %ebp                /* ebp = zoom_width * 4 bytes */
        movl    coeffs, %eax            /* eax -> first coefficient record */
        movl    expix1, %edx            /* edx = source base */
        movl    expix2, %ebx            /* ebx -> first destination pixel */

.Lzoom_loop:
;// locate the source pixels for this output pixel
        movl    (%eax), %esi            /* byte offset stored in the record */
        addl    %edx, %esi              /* esi = source base + offset */

;// fetch p0 (low dword) and p1 (high dword)
        movq    (%esi), %mm0            /* mm0 = p1:p0 */
        movq    %mm0, %mm1              /* mm1 = p1:p0 */

;// fetch the four weights
        movd    4(%eax), %mm6           /* 00-00-00-00-c4-c3-c2-c1 */

        punpcklbw %mm7, %mm0            /* mm0 = p0 channels as 4 words */
        movq    %mm6, %mm5              /* 00-00-00-00-c4-c3-c2-c1 */
        punpckhbw %mm7, %mm1            /* mm1 = p1 channels as 4 words */

;// replicate every weight into four bytes
        punpcklbw %mm5, %mm6            /* c4-c4-c3-c3-c2-c2-c1-c1 */
        movq    %mm6, %mm4              /* c4-c4-c3-c3-c2-c2-c1-c1 */
        movq    %mm6, %mm5              /* c4-c4-c3-c3-c2-c2-c1-c1 */
        punpcklbw %mm5, %mm6            /* c2-c2-c2-c2-c1-c1-c1-c1 */
        punpckhbw %mm5, %mm4            /* c4-c4-c4-c4-c3-c3-c3-c3 */

;// widen c1 and c2 to words
        movq    %mm6, %mm3              /* c2-c2-c2-c2-c1-c1-c1-c1 */
        punpcklbw %mm7, %mm6            /* 00-c1-00-c1-00-c1-00-c1 */
        punpckhbw %mm7, %mm3            /* 00-c2-00-c2-00-c2-00-c2 */

;// weight the first pixel pair and start the sum
        pmullw  %mm6, %mm0              /* mm0 = p0 * c1 (per channel) */
        pmullw  %mm3, %mm1              /* mm1 = p1 * c2 (per channel) */
        paddw   %mm1, %mm0              /* mm0 = p0*c1 + p1*c2 */

;// widen c3 and c4 to words
        movq    %mm4, %mm5              /* c4-c4-c4-c4-c3-c3-c3-c3 */
        punpcklbw %mm7, %mm4            /* 00-c3-00-c3-00-c3-00-c3 */
        punpckhbw %mm7, %mm5            /* 00-c4-00-c4-00-c4-00-c4 */

;// fetch p2 (low dword) and p3 (high dword), zoom_width*4 bytes further
        movq    (%esi,%ebp), %mm1       /* mm1 = p3:p2 */
        movq    %mm1, %mm2              /* mm2 = p3:p2 */
        punpcklbw %mm7, %mm1            /* mm1 = p2 channels as 4 words */
        punpckhbw %mm7, %mm2            /* mm2 = p3 channels as 4 words */

;// weight the second pixel pair and finish the sum
        pmullw  %mm4, %mm1              /* mm1 = p2 * c3 */
        pmullw  %mm5, %mm2              /* mm2 = p3 * c4 */
        paddw   %mm1, %mm0
        paddw   %mm2, %mm0              /* mm0 = sum of the four products */

;// scale back by 256 and repack the four channels into bytes
        psrlw   $8, %mm0
        packuswb %mm7, %mm0             /* low dword = result, high dword = 0 */

;// store the pixel and step to the next record / destination slot
        movd    %mm0, (%ebx)
        addl    $8, %eax                /* next 8-byte coefficient record */
        addl    $4, %ebx                /* next 4-byte destination pixel */
        decl    %ecx
        jnz     .Lzoom_loop

        emms                            /* leave MMX state before returning to C */
        popl    %esi
        popl    %ebx
        popl    %ebp
.Lzoom_ret:
        ret