gst/goom/ppc_zoom_ultimate.s - imx-gst-plugins-good - Git at Google

 ; PowerPC optimized zoom for Goom
 ; © 2001-2003 Guillaume Borios
 ; This library is free software; you can redistribute it and/or
 ; modify it under the terms of the GNU Library General Public
 ; License as published by the Free Software Foundation; either
 ; version 2 of the License, or (at your option) any later version.
 ;
 ; This library is distributed in the hope that it will be useful,
 ; but WITHOUT ANY WARRANTY; without even the implied warranty of
 ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ; Library General Public License for more details.
 ;
 ; You should have received a copy of the GNU Library General Public
 ; License along with this library; if not, write to the
 ; Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
 ; Boston, MA 02110-1301, USA.

 ; Change log :
 ; 21 Dec 2003 : Use of altivec is now determined with a parameter

 ; Section definition : We use a read only section
 .text

 ; name of the function to call by C program : ppc_zoom
 ; We declare this label as a global to extend its scope outside this file
 .globl _ppc_zoom_generic
 .globl _ppc_zoom_G4

 ; Description :
 ; This routine dynamically computes and applies a zoom filter

 ; parameters :
 ; r3  <=> unsigned int sizeX (in pixels)
 ; r4  <=> unsigned int sizeY (in pixels)
 ; r5  <=> unsigned int * frompixmap
 ; r6  <=> unsigned int * topixmap
 ; r7  <=> unsigned int * brutS
 ; r8  <=> unsigned int * brutD
 ; r9  <=> unsigned int buffratio
 ; r10 <=> int [16][16] precalccoeffs

 ; globals after init
 ; r5  <=> frompixmap - 1 byte needed for preincremental fetch (replaces r5)
 ; r6  <=> topixmap - 1 byte needed for preincremental fetch (replaces r6)
 ; r3 <=> ax = x max in 16th of pixels (replaces old r3)
 ; r4 <=> ay = y max in 16th of pixels (replaces old r4)
 ; r20 <=> row size in bytes
 ; r12 <=> 0xFF00FF (mask for parallel 32 bits pixs computing)
 ; r30 <=> brutS - 1 byte needed for preincremental fetch (replaces r7)
 ; r31 <=> brutD - 1 byte needed for preincremental fetch (replaces r8)

 ; ABI notes :
 ; r1 is the Stack Pointer (SP) => Do not use
 ; r13..r31 are non-volatiles => Do not use

 _ppc_zoom_generic:

 ; Saves the used non volatile registers in the Mach-O stack s Red-Zone
 stmw 	r18,-56(r1)

 ; init
 li      r18,0		; Default value if out of range : 0 (Black)
 mr      r11,r10
 lis     r12,0xFF
 mullw   r2,r3,r4	; Number of pixels to compute
 subi    r30,r8,0
 slwi	r20,r3,2
 srawi   r19,r20,2
 ori     r12,r12,0xFF
 subi    r3,r3,1
 subi    r4,r4,1
 mtspr	ctr,r2		; Init the loop count (one loop per pixel computed)
 subi    r31,r7,0
 subi    r6,r6,4
 slwi	r3,r3,4
 slwi	r4,r4,4

 ;pre init for loop
 lwz	r2,0(r31)    ; px
 lwz	r29,4(r31)   ; py
 lwz	r8,0(r30)    ; px2
 lwz	r10,4(r30)   ; py2

 b       L1
 .align  5
 L1:

 ; computes dynamically the position to fetch
 sub     r8,r8,r2
 sub     r10,r10,r29
 mullw   r8,r8,r9
 addi    r31,r31,8
 mullw   r10,r10,r9
 addi    r30,r30,8

 srawi   r8,r8,16
 srawi   r10,r10,16
 add     r2,r2,r8
 add     r29,r29,r10

 ; if px>ax or py>ay goto outofrange
 ; computes the attenuation coeffs and the original point address
 rlwinm  r10,r2,6,28-6,31-6 ; r10 <- (r2 << 2) & 0x000002D0   (r10=(r2%16)*4*16)
 cmpl    cr4,0,r2,r3
 rlwimi  r10, r29, 2, 28-2, 31-2 ; r10 <- ((r29 << 2) & 0x0000002D) | (r10 & !0x0000002D)      (r10=(r10%16)*4 | r10)
 cmpl    cr7,0,r29,r4
 srawi   r29,r29,4     ; pos computing
 bge-	cr4,L4
 srawi   r2,r2,4       ; pos computing
 mullw   r29, r29,r19  ; pos computing
 bge-	cr7,L4

 ; Channels notation : 00112233 (AARRVVBB)

 add     r2,r2,r29    		; pos computing
 lwzx    r10,r11,r10		; Loads coefs
 slwi    r2,r2,2      		; pos computing
 add	r2,r2,r5     		; pos computing
 rlwinm  r21,r10,0,24,31	        ; Isolates coef1 (??????11 -> 00000011)
 lwz	r25,0(r2)		; Loads col1 -> r25
 lwz	r26,4(r2)		; Loads col2 -> r26
 rlwinm  r22,r10,24,24,31	; Isolates coef2 (????22?? -> 00000022)
 rlwinm  r23,r10,16,24,31	; Isolates coef3 (??33???? -> 00000033)
 add	r2,r2,r20		; Adds one line for future load of col3 and col4
 and	r8, r25,r12		; Masks col1 channels 1 & 3 : 0x00XX00XX
 rlwinm  r24,r10,8,24,31		; Isolates coef4 (44?????? -> 00000044)
 andi.	r25,r25,0xFF00		; Masks col1 channel 2 : 0x0000XX00
 mullw	r8, r8, r21		; Applies coef1 on col1 channels 1 & 3


 ; computes final pixel color
 and	r10,r26,r12		; Masks col2 channels 1 & 3 : 0x00XX00XX
 lwz	r27,0(r2)		; Loads col3 -> r27
 mullw	r10,r10,r22		; Applies coef2 on col2 channels 1 & 3
 mullw	r25,r25,r21		; Applies coef1 on col1 channel 2
 andi.	r29,r26,0xFF00		; Masks col2 channel 2 : 0x0000XX00
 mullw	r29,r29,r22		; Applies coef2 on col2 channel 2
 lwz	r28,4(r2)		; Loads col4 -> r28
 add	r8 ,r8 ,r10		; Adds col1 & col2 channels 1 & 3
 and	r10,r27,r12		; Masks col3 channels 1 & 3 : 0x00XX00XX
 add	r25,r25,r29		; Adds col1 & col2 channel 2
 mullw	r10,r10,r23		; Applies coef3 on col3 channels 1 & 3
 andi.	r29,r27,0xFF00		; Masks col3 channel 2 : 0x0000XX00
 mullw	r29,r29,r23		; Applies coef3 on col3 channel 2
 lwz	r2,0(r31)		; px
 add	r7 ,r8 ,r10		; Adds col3 to (col1 + col2) channels 1 & 3
 and	r10,r28,r12		; Masks col4 channels 1 & 3 : 0x00XX00XX
 mullw	r10,r10,r24		; Applies coef4 on col4 channels 1 & 3
 add	r25,r25,r29		; Adds col 3 to (col1 + col2) channel 2
 lwz 	r8,0(r30)    		; px2
 andi.	r28,r28,0xFF00		; Masks col4 channel 2 : 0x0000XX00
 add	r7 ,r7 ,r10		; Adds col4 to (col1 + col2 + col3) channels 1 & 3
 lwz	r10,4(r30)   		; py2
 mullw	r28,r28,r24		; Applies coef4 on col4 channel 2
 srawi	r7, r7, 8		; (sum of channels 1 & 3) >> 8
 lwz	r29,4(r31)              ; py
 add	r25,r25,r28		; Adds col 4 to (col1 + col2 + col3) channel 2
 rlwimi  r7, r25, 24, 16, 23	; (((sum of channels 2) >> 8 ) & 0x0000FF00) | ((sum of channels 1 and 3) & 0xFFFF00FF)
 stwu	r7,4(r6)		; Stores the computed pixel
 bdnz	L1			; Iterate again if needed
 b       L3	;goto end	; If not, returns from the function


 ; if out of range
 L4:
 stwu	r18,4(r6)
 lwz	r8,0(r30)    ; px2
 lwz	r10,4(r30)   ; py2
 lwz	r2,0(r31)    ; px
 lwz	r29,4(r31)   ; py
 bdnz	L1


 L3:

 ; Restore saved registers and return
 lmw	r18,-56(r1)
 blr


 _ppc_zoom_G4:

 ; Saves the used non volatile registers in the Mach-O stack s Red-Zone
 stmw 	r17,-60(r1)

 ; init
 li      r18,0		; Default value if out of range : 0 (Black)
 mr      r11,r10
 lis     r12,0xFF
 mullw   r2,r3,r4	; Number of pixels to compute
 subi    r30,r8,0
 slwi	r20,r3,2
 srawi   r19,r20,2
 ori     r12,r12,0xFF
 subi    r3,r3,1
 subi    r4,r4,1
 mtspr	ctr,r2		; Init the loop count (one loop per pixel computed)
 subi    r31,r7,0
 subi    r6,r6,4
 slwi	r3,r3,4
 slwi	r4,r4,4

 ;pre init for loop
 lwz	r2,0(r31)    ; px
 lwz	r29,4(r31)   ; py
 lwz	r8,0(r30)    ; px2
 lwz	r10,4(r30)   ; py2

 ;*********************
 lis     r17,0x0F01

 b       L100
 .align  5
 L100:

 addi    r6,r6,4

 ; Optimization to ensure the destination buffer
 ; won't be loaded into the data cache
 rlwinm. r0,r6,0,27,31
 bne+    L500
 dcbz    0,r6
 ;dcba    0,r6
 L500:

 ; computes dynamically the position to fetch
 ;mullw   r8,r8,r29
 ;mullw   r2,r2,r29
 ;add     r2,r8,r2
 ;srawi   r2,r2,17

 sub     r8,r8,r2
 sub     r10,r10,r29
 mullw   r8,r8,r9
 addi    r31,r31,8
 mullw   r10,r10,r9
 addi    r30,r30,8

 dst     r30,r17,0

 srawi    r8,r8,16
 srawi    r10,r10,16
 add     r2,r2,r8
 add     r29,r29,r10

 dst     r31,r17,1

 ; if px>ax or py>ay goto outofrange
 ; computes the attenuation coeffs and the original point address
 rlwinm  r10,r2,6,28-6,31-6 ; r10 <- (r2 << 2) & 0x000002D0   (r10=(r2%16)*4*16)
 cmpl    cr4,0,r2,r3
 rlwimi  r10, r29, 2, 28-2, 31-2 ; r10 <- ((r29 << 2) & 0x0000002D) | (r10 & !0x0000002D)      (r10=(r29%16)*4 | r10)
 cmpl    cr7,0,r29,r4
 srawi   r29,r29,4     ; pos computing
 bge-	cr4,L400
 srawi   r2,r2,4       ; pos computing
 mullw   r29, r29,r19  ; pos computing
 bge-	cr7,L400

 ; Channels notation : 00112233 (AARRVVBB)

 add     r2,r2,r29    		; pos computing
 lwzx    r10,r11,r10		; Loads coefs
 slwi    r2,r2,2      		; pos computing
 add	r2,r2,r5     		; pos computing
 rlwinm  r21,r10,0,24,31	        ; Isolates coef1 (??????11 -> 00000011)
 lwz	r25,0(r2)		; Loads col1 -> r25
 lwz	r26,4(r2)		; Loads col2 -> r26
 rlwinm  r22,r10,24,24,31	; Isolates coef2 (????22?? -> 00000022)
 rlwinm  r23,r10,16,24,31	; Isolates coef3 (??33???? -> 00000033)
 add	r2,r2,r20		; Adds one line for future load of col3 and col4
 and	r8, r25,r12		; Masks col1 channels 1 & 3 : 0x00XX00XX
 rlwinm  r24,r10,8,24,31		; Isolates coef4 (44?????? -> 00000044)
 dst     r2,r17,2
 rlwinm  r25,r25,0,16,23		; Masks col1 channel 2 : 0x0000XX00
 ;andi.	r25,r25,0xFF00		; Masks col1 channel 2 : 0x0000XX00
 mullw	r8, r8, r21		; Applies coef1 on col1 channels 1 & 3


 ; computes final pixel color
 and	r10,r26,r12		; Masks col2 channels 1 & 3 : 0x00XX00XX
 lwz	r27,0(r2)		; Loads col3 -> r27
 mullw	r10,r10,r22		; Applies coef2 on col2 channels 1 & 3
 mullw	r25,r25,r21		; Applies coef1 on col1 channel 2
 rlwinm  r29,r26,0,16,23		; Masks col2 channel 2 : 0x0000XX00
 ;andi.	r29,r26,0xFF00		; Masks col2 channel 2 : 0x0000XX00
 mullw	r29,r29,r22		; Applies coef2 on col2 channel 2
 lwz	r28,4(r2)		; Loads col4 -> r28
 add	r8 ,r8 ,r10		; Adds col1 & col2 channels 1 & 3
 and	r10,r27,r12		; Masks col3 channels 1 & 3 : 0x00XX00XX
 add	r25,r25,r29		; Adds col1 & col2 channel 2
 mullw	r10,r10,r23		; Applies coef3 on col3 channels 1 & 3
 rlwinm  r29,r27,0,16,23		; Masks col3 channel 2 : 0x0000XX00
 ;andi.	r29,r27,0xFF00		; Masks col3 channel 2 : 0x0000XX00
 mullw	r29,r29,r23		; Applies coef3 on col3 channel 2
 lwz	r2,0(r31)		; px
 add	r7 ,r8 ,r10		; Adds col3 to (col1 + col2) channels 1 & 3
 and	r10,r28,r12		; Masks col4 channels 1 & 3 : 0x00XX00XX
 mullw	r10,r10,r24		; Applies coef4 on col4 channels 1 & 3
 add	r25,r25,r29		; Adds col 3 to (col1 + col2) channel 2
 lwz 	r8,0(r30)    		; px2
 rlwinm  r28,r28,0,16,23		; Masks col4 channel 2 : 0x0000XX00
 ;andi.	r28,r28,0xFF00		; Masks col4 channel 2 : 0x0000XX00
 add	r7 ,r7 ,r10		; Adds col4 to (col1 + col2 + col3) channels 1 & 3
 lwz	r10,4(r30)   		; py2
 mullw	r28,r28,r24		; Applies coef4 on col4 channel 2
 srawi	r7, r7, 8		; (sum of channels 1 & 3) >> 8
 lwz	r29,4(r31)              ; py
 add	r25,r25,r28		; Adds col 4 to (col1 + col2 + col3) channel 2
 rlwimi  r7, r25, 24, 16, 23	; (((sum of channels 2) >> 8 ) & 0x0000FF00) | ((sum of channels 1 and 3) & 0xFFFF00FF)
 stw	r7,0(r6)		; Stores the computed pixel
 bdnz	L100			; Iterate again if needed
 b       L300	;goto end	; If not, returns from the function


 ; if out of range
 L400:
 stw	r18,0(r6)
 lwz	r8,0(r30)    ; px2
 lwz	r10,4(r30)   ; py2
 lwz	r2,0(r31)    ; px
 lwz	r29,4(r31)   ; py
 bdnz	L100


 L300:

 ; Restore saved registers and return
 lmw	r17,-60(r1)
 blr
	; PowerPC optimized zoom for Goom
	; © 2001-2003 Guillaume Borios
	; This library is free software; you can redistribute it and/or
	; modify it under the terms of the GNU Library General Public
	; License as published by the Free Software Foundation; either
	; version 2 of the License, or (at your option) any later version.
	;
	; This library is distributed in the hope that it will be useful,
	; but WITHOUT ANY WARRANTY; without even the implied warranty of
	; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	; Library General Public License for more details.
	;
	; You should have received a copy of the GNU Library General Public
	; License along with this library; if not, write to the
	; Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
	; Boston, MA 02110-1301, USA.

	; Change log :
	; 21 Dec 2003 : Use of altivec is now determined with a parameter

	; Section definition : We use a read only section
	.text

	; name of the function to call by C program : ppc_zoom
	; We declare this label as a global to extend its scope outside this file
	.globl _ppc_zoom_generic
	.globl _ppc_zoom_G4

	; Description :
	; This routine dynamically computes and applies a zoom filter

	; parameters :
	; r3 <=> unsigned int sizeX (in pixels)
	; r4 <=> unsigned int sizeY (in pixels)
	; r5 <=> unsigned int * frompixmap
	; r6 <=> unsigned int * topixmap
	; r7 <=> unsigned int * brutS
	; r8 <=> unsigned int * brutD
	; r9 <=> unsigned int buffratio
	; r10 <=> int [16][16] precalccoeffs

	; globals after init
	; r5 <=> frompixmap - 1 byte needed for preincremental fetch (replaces r5)
	; r6 <=> topixmap - 1 byte needed for preincremental fetch (replaces r6)
	; r3 <=> ax = x max in 16th of pixels (replaces old r3)
	; r4 <=> ay = y max in 16th of pixels (replaces old r4)
	; r20 <=> row size in bytes
	; r12 <=> 0xFF00FF (mask for parallel 32 bits pixs computing)
	; r30 <=> brutS - 1 byte needed for preincremental fetch (replaces r7)
	; r31 <=> brutD - 1 byte needed for preincremental fetch (replaces r8)

	; ABI notes :
	; r1 is the Stack Pointer (SP) => Do not use
	; r13..r31 are non-volatiles => Do not use

	_ppc_zoom_generic:

	; Saves the used non volatile registers in the Mach-O stack s Red-Zone
	stmw r18,-56(r1)

	; init
	li r18,0 ; Default value if out of range : 0 (Black)
	mr r11,r10
	lis r12,0xFF
	mullw r2,r3,r4 ; Number of pixels to compute
	subi r30,r8,0
	slwi r20,r3,2
	srawi r19,r20,2
	ori r12,r12,0xFF
	subi r3,r3,1
	subi r4,r4,1
	mtspr ctr,r2 ; Init the loop count (one loop per pixel computed)
	subi r31,r7,0
	subi r6,r6,4
	slwi r3,r3,4
	slwi r4,r4,4

	;pre init for loop
	lwz r2,0(r31) ; px
	lwz r29,4(r31) ; py
	lwz r8,0(r30) ; px2
	lwz r10,4(r30) ; py2

	b L1
	.align 5
	L1:

	; computes dynamically the position to fetch
	sub r8,r8,r2
	sub r10,r10,r29
	mullw r8,r8,r9
	addi r31,r31,8
	mullw r10,r10,r9
	addi r30,r30,8

	srawi r8,r8,16
	srawi r10,r10,16
	add r2,r2,r8
	add r29,r29,r10

	; if px>ax or py>ay goto outofrange
	; computes the attenuation coeffs and the original point address
	rlwinm r10,r2,6,28-6,31-6 ; r10 <- (r2 << 2) & 0x000002D0 (r10=(r2%16)416)
	cmpl cr4,0,r2,r3
	rlwimi r10, r29, 2, 28-2, 31-2 ; r10 <- ((r29 << 2) & 0x0000002D) \| (r10 & !0x0000002D) (r10=(r10%16)*4 \| r10)
	cmpl cr7,0,r29,r4
	srawi r29,r29,4 ; pos computing
	bge- cr4,L4
	srawi r2,r2,4 ; pos computing
	mullw r29, r29,r19 ; pos computing
	bge- cr7,L4

	; Channels notation : 00112233 (AARRVVBB)

	add r2,r2,r29 ; pos computing
	lwzx r10,r11,r10 ; Loads coefs
	slwi r2,r2,2 ; pos computing
	add r2,r2,r5 ; pos computing
	rlwinm r21,r10,0,24,31 ; Isolates coef1 (??????11 -> 00000011)
	lwz r25,0(r2) ; Loads col1 -> r25
	lwz r26,4(r2) ; Loads col2 -> r26
	rlwinm r22,r10,24,24,31 ; Isolates coef2 (????22?? -> 00000022)
	rlwinm r23,r10,16,24,31 ; Isolates coef3 (??33???? -> 00000033)
	add r2,r2,r20 ; Adds one line for future load of col3 and col4
	and r8, r25,r12 ; Masks col1 channels 1 & 3 : 0x00XX00XX
	rlwinm r24,r10,8,24,31 ; Isolates coef4 (44?????? -> 00000044)
	andi. r25,r25,0xFF00 ; Masks col1 channel 2 : 0x0000XX00
	mullw r8, r8, r21 ; Applies coef1 on col1 channels 1 & 3


	; computes final pixel color
	and r10,r26,r12 ; Masks col2 channels 1 & 3 : 0x00XX00XX
	lwz r27,0(r2) ; Loads col3 -> r27
	mullw r10,r10,r22 ; Applies coef2 on col2 channels 1 & 3
	mullw r25,r25,r21 ; Applies coef1 on col1 channel 2
	andi. r29,r26,0xFF00 ; Masks col2 channel 2 : 0x0000XX00
	mullw r29,r29,r22 ; Applies coef2 on col2 channel 2
	lwz r28,4(r2) ; Loads col4 -> r28
	add r8 ,r8 ,r10 ; Adds col1 & col2 channels 1 & 3
	and r10,r27,r12 ; Masks col3 channels 1 & 3 : 0x00XX00XX
	add r25,r25,r29 ; Adds col1 & col2 channel 2
	mullw r10,r10,r23 ; Applies coef3 on col3 channels 1 & 3
	andi. r29,r27,0xFF00 ; Masks col3 channel 2 : 0x0000XX00
	mullw r29,r29,r23 ; Applies coef3 on col3 channel 2
	lwz r2,0(r31) ; px
	add r7 ,r8 ,r10 ; Adds col3 to (col1 + col2) channels 1 & 3
	and r10,r28,r12 ; Masks col4 channels 1 & 3 : 0x00XX00XX
	mullw r10,r10,r24 ; Applies coef4 on col4 channels 1 & 3
	add r25,r25,r29 ; Adds col 3 to (col1 + col2) channel 2
	lwz r8,0(r30) ; px2
	andi. r28,r28,0xFF00 ; Masks col4 channel 2 : 0x0000XX00
	add r7 ,r7 ,r10 ; Adds col4 to (col1 + col2 + col3) channels 1 & 3
	lwz r10,4(r30) ; py2
	mullw r28,r28,r24 ; Applies coef4 on col4 channel 2
	srawi r7, r7, 8 ; (sum of channels 1 & 3) >> 8
	lwz r29,4(r31) ; py
	add r25,r25,r28 ; Adds col 4 to (col1 + col2 + col3) channel 2
	rlwimi r7, r25, 24, 16, 23 ; (((sum of channels 2) >> 8 ) & 0x0000FF00) \| ((sum of channels 1 and 3) & 0xFFFF00FF)
	stwu r7,4(r6) ; Stores the computed pixel
	bdnz L1 ; Iterate again if needed
	b L3 ;goto end ; If not, returns from the function


	; if out of range
	L4:
	stwu r18,4(r6)
	lwz r8,0(r30) ; px2
	lwz r10,4(r30) ; py2
	lwz r2,0(r31) ; px
	lwz r29,4(r31) ; py
	bdnz L1


	L3:

	; Restore saved registers and return
	lmw r18,-56(r1)
	blr








	_ppc_zoom_G4:

	; Saves the used non volatile registers in the Mach-O stack s Red-Zone
	stmw r17,-60(r1)

	; init
	li r18,0 ; Default value if out of range : 0 (Black)
	mr r11,r10
	lis r12,0xFF
	mullw r2,r3,r4 ; Number of pixels to compute
	subi r30,r8,0
	slwi r20,r3,2
	srawi r19,r20,2
	ori r12,r12,0xFF
	subi r3,r3,1
	subi r4,r4,1
	mtspr ctr,r2 ; Init the loop count (one loop per pixel computed)
	subi r31,r7,0
	subi r6,r6,4
	slwi r3,r3,4
	slwi r4,r4,4

	;pre init for loop
	lwz r2,0(r31) ; px
	lwz r29,4(r31) ; py
	lwz r8,0(r30) ; px2
	lwz r10,4(r30) ; py2

	;*********************
	lis r17,0x0F01

	b L100
	.align 5
	L100:

	addi r6,r6,4

	; Optimization to ensure the destination buffer
	; won't be loaded into the data cache
	rlwinm. r0,r6,0,27,31
	bne+ L500
	dcbz 0,r6
	;dcba 0,r6
	L500:

	; computes dynamically the position to fetch
	;mullw r8,r8,r29
	;mullw r2,r2,r29
	;add r2,r8,r2
	;srawi r2,r2,17

	sub r8,r8,r2
	sub r10,r10,r29
	mullw r8,r8,r9
	addi r31,r31,8
	mullw r10,r10,r9
	addi r30,r30,8

	dst r30,r17,0

	srawi r8,r8,16
	srawi r10,r10,16
	add r2,r2,r8
	add r29,r29,r10

	dst r31,r17,1

	; if px>ax or py>ay goto outofrange
	; computes the attenuation coeffs and the original point address
	rlwinm r10,r2,6,28-6,31-6 ; r10 <- (r2 << 2) & 0x000002D0 (r10=(r2%16)416)
	cmpl cr4,0,r2,r3
	rlwimi r10, r29, 2, 28-2, 31-2 ; r10 <- ((r29 << 2) & 0x0000002D) \| (r10 & !0x0000002D) (r10=(r29%16)*4 \| r10)
	cmpl cr7,0,r29,r4
	srawi r29,r29,4 ; pos computing
	bge- cr4,L400
	srawi r2,r2,4 ; pos computing
	mullw r29, r29,r19 ; pos computing
	bge- cr7,L400

	; Channels notation : 00112233 (AARRVVBB)

	add r2,r2,r29 ; pos computing
	lwzx r10,r11,r10 ; Loads coefs
	slwi r2,r2,2 ; pos computing
	add r2,r2,r5 ; pos computing
	rlwinm r21,r10,0,24,31 ; Isolates coef1 (??????11 -> 00000011)
	lwz r25,0(r2) ; Loads col1 -> r25
	lwz r26,4(r2) ; Loads col2 -> r26
	rlwinm r22,r10,24,24,31 ; Isolates coef2 (????22?? -> 00000022)
	rlwinm r23,r10,16,24,31 ; Isolates coef3 (??33???? -> 00000033)
	add r2,r2,r20 ; Adds one line for future load of col3 and col4
	and r8, r25,r12 ; Masks col1 channels 1 & 3 : 0x00XX00XX
	rlwinm r24,r10,8,24,31 ; Isolates coef4 (44?????? -> 00000044)
	dst r2,r17,2
	rlwinm r25,r25,0,16,23 ; Masks col1 channel 2 : 0x0000XX00
	;andi. r25,r25,0xFF00 ; Masks col1 channel 2 : 0x0000XX00
	mullw r8, r8, r21 ; Applies coef1 on col1 channels 1 & 3


	; computes final pixel color
	and r10,r26,r12 ; Masks col2 channels 1 & 3 : 0x00XX00XX
	lwz r27,0(r2) ; Loads col3 -> r27
	mullw r10,r10,r22 ; Applies coef2 on col2 channels 1 & 3
	mullw r25,r25,r21 ; Applies coef1 on col1 channel 2
	rlwinm r29,r26,0,16,23 ; Masks col2 channel 2 : 0x0000XX00
	;andi. r29,r26,0xFF00 ; Masks col2 channel 2 : 0x0000XX00
	mullw r29,r29,r22 ; Applies coef2 on col2 channel 2
	lwz r28,4(r2) ; Loads col4 -> r28
	add r8 ,r8 ,r10 ; Adds col1 & col2 channels 1 & 3
	and r10,r27,r12 ; Masks col3 channels 1 & 3 : 0x00XX00XX
	add r25,r25,r29 ; Adds col1 & col2 channel 2
	mullw r10,r10,r23 ; Applies coef3 on col3 channels 1 & 3
	rlwinm r29,r27,0,16,23 ; Masks col3 channel 2 : 0x0000XX00
	;andi. r29,r27,0xFF00 ; Masks col3 channel 2 : 0x0000XX00
	mullw r29,r29,r23 ; Applies coef3 on col3 channel 2
	lwz r2,0(r31) ; px
	add r7 ,r8 ,r10 ; Adds col3 to (col1 + col2) channels 1 & 3
	and r10,r28,r12 ; Masks col4 channels 1 & 3 : 0x00XX00XX
	mullw r10,r10,r24 ; Applies coef4 on col4 channels 1 & 3
	add r25,r25,r29 ; Adds col 3 to (col1 + col2) channel 2
	lwz r8,0(r30) ; px2
	rlwinm r28,r28,0,16,23 ; Masks col4 channel 2 : 0x0000XX00
	;andi. r28,r28,0xFF00 ; Masks col4 channel 2 : 0x0000XX00
	add r7 ,r7 ,r10 ; Adds col4 to (col1 + col2 + col3) channels 1 & 3
	lwz r10,4(r30) ; py2
	mullw r28,r28,r24 ; Applies coef4 on col4 channel 2
	srawi r7, r7, 8 ; (sum of channels 1 & 3) >> 8
	lwz r29,4(r31) ; py
	add r25,r25,r28 ; Adds col 4 to (col1 + col2 + col3) channel 2
	rlwimi r7, r25, 24, 16, 23 ; (((sum of channels 2) >> 8 ) & 0x0000FF00) \| ((sum of channels 1 and 3) & 0xFFFF00FF)
	stw r7,0(r6) ; Stores the computed pixel
	bdnz L100 ; Iterate again if needed
	b L300 ;goto end ; If not, returns from the function


	; if out of range
	L400:
	stw r18,0(r6)
	lwz r8,0(r30) ; px2
	lwz r10,4(r30) ; py2
	lwz r2,0(r31) ; px
	lwz r29,4(r31) ; py
	bdnz L100


	L300:

	; Restore saved registers and return
	lmw r17,-60(r1)
	blr