/* libpixelflinger/t32cb16blend.S - android-core - Git at Google */

 /* libs/pixelflinger/t32cb16blend.S
 **
 ** Copyright 2006, The Android Open Source Project
 **
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 **
 **     http://www.apache.org/licenses/LICENSE-2.0
 **
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 */


 	.text
 	.syntax unified
 	.align

 	.global scanline_t32cb16blend_arm


 /*
  * .macro pixel
  *
  * Blends one pre-multiplied 32-bit source pixel over one RGB565 destination
  * pixel and places the result in the matching 16-bit half of \FB.
  *
  * \DREG is a 32-bit register containing *two* original destination RGB565
  *       pixels, with the even one in the low-16 bits, and the odd one in the
  *       high 16 bits.
  *
  * \SRC is a 32-bit 0xAABBGGRR pixel value, with pre-multiplied colors.
  *
  * \FB is a target register that will contain the blended pixel values.
  *     With \ODD == 0 it is overwritten (its high 16 bits end up zero);
  *     with \ODD != 0 the result is OR-ed into its high 16 bits, so those
  *     bits must be zero on entry, as they are after an \ODD == 0 expansion.
  *
  * \ODD is either 0 or 1 and indicates if we're blending the lower or
  *      upper 16-bit pixels in DREG into FB
  *
  * Per channel the result is
  *     min(max, src_channel + ((dst_channel * f) >> 8))
  * where f = 0x100 - (sA + (sA >> 7)), src_channel is the top 5 (red, blue)
  * or top 6 (green) bits of the corresponding source byte, and max is 0x1F
  * (red, blue) or 0x3F (green).
  *
  * \DREG, \SRC and \FB must not be r6, r7 or lr, and \FB must be distinct
  * from \DREG and \SRC: the inputs are still read after the scratch
  * registers and \FB have been written.
  *
  * clobbered: r6, r7, lr, condition flags (every channel ends with a cmp)
  *
  */

 .macro pixel,   DREG, SRC, FB, ODD

     // SRC = 0xAABBGGRR
     mov     r7, \SRC, lsr #24           // r7 = sA (source alpha, 0..0xFF)
     add     r7, r7, r7, lsr #7          // sA + (sA >> 7), i.e. 0..0x100
     rsb     r7, r7, #0x100              // r7 = f = 0x100 - (sA+(sA>>7))

     // NOTE(review): this local label is not referenced anywhere in the
     // visible source.
 1:

 .if \ODD

     // Odd pixel: destination fields are taken from the high 16 bits of
     // \DREG and every result is OR-ed into the high 16 bits of \FB.

     // red
     mov     lr, \DREG, lsr #(16 + 11)       // top 5 bits, no mask needed
     smulbb  lr, r7, lr                      // f * dst red
     mov     r6, \SRC, lsr #3
     and     r6, r6, #0x1F                   // top 5 bits of source RR
     add     lr, r6, lr, lsr #8
     cmp     lr, #0x1F                       // saturate at 0x1F
     orrhs   \FB, \FB, #(0x1F<<(16 + 11))
     orrlo   \FB, \FB, lr, lsl #(16 + 11)

         // green
         and     r6, \DREG, #(0x3F<<(16 + 5))    // field kept in place
         smulbt  r6, r7, r6                      // f * (green << 5), top half of r6
         mov     lr, \SRC, lsr #(8+2)
         and     lr, lr, #0x3F                   // top 6 bits of source GG
         add     r6, lr, r6, lsr #(5+8)          // undo the << 5 and the 8-bit scale
         cmp     r6, #0x3F                       // saturate at 0x3F
         orrhs   \FB, \FB, #(0x3F<<(16 + 5))
         orrlo   \FB, \FB, r6, lsl #(16 + 5)

             // blue
             and     lr, \DREG, #(0x1F << 16)
             smulbt  lr, r7, lr                  // f * dst blue, top half of lr
             mov     r6, \SRC, lsr #(8+8+3)
             and     r6, r6, #0x1F               // top 5 bits of source BB
             add     lr, r6, lr, lsr #8
             cmp     lr, #0x1F                   // saturate at 0x1F
             orrhs   \FB, \FB, #(0x1F << 16)
             orrlo   \FB, \FB, lr, lsl #16

 .else

     // Even pixel: destination fields are taken from the low 16 bits of
     // \DREG. The red result is written with mov, which (re)initializes
     // \FB; green and blue are then OR-ed in.

     // red
     mov     lr, \DREG, lsr #11
     and     lr, lr, #0x1F                   // drop the odd pixel's bits
     smulbb  lr, r7, lr                      // f * dst red
     mov     r6, \SRC, lsr #3
     and     r6, r6, #0x1F                   // top 5 bits of source RR
     add     lr, r6, lr, lsr #8
     cmp     lr, #0x1F                       // saturate at 0x1F
     movhs   \FB, #(0x1F<<11)
     movlo   \FB, lr, lsl #11


         // green
         and     r6, \DREG, #(0x3F<<5)           // field kept in place
         smulbb  r6, r7, r6                      // f * (green << 5)
         mov     lr, \SRC, lsr #(8+2)
         and     lr, lr, #0x3F                   // top 6 bits of source GG
         add     r6, lr, r6, lsr #(5+8)          // undo the << 5 and the 8-bit scale
         cmp     r6, #0x3F                       // saturate at 0x3F
         orrhs   \FB, \FB, #(0x3F<<5)
         orrlo   \FB, \FB, r6, lsl #5

             // blue
             and     lr, \DREG, #0x1F
             smulbb  lr, r7, lr                  // f * dst blue
             mov     r6, \SRC, lsr #(8+8+3)
             and     r6, r6, #0x1F               // top 5 bits of source BB
             add     lr, r6, lr, lsr #8
             cmp     lr, #0x1F                   // saturate at 0x1F
             orrhs   \FB, \FB, #0x1F
             orrlo   \FB, \FB, lr

 .endif

     .endm


 // scanline_t32cb16blend_arm
 //
 // Blends a run of 32-bit source pixels (0xAABBGGRR, pre-multiplied, see the
 // pixel macro) over 16-bit RGB565 destination pixels, in place.
 //
 // In:
 //   r0:  dst ptr  (16-bit pixels, advanced by 2 bytes per pixel)
 //   r1:  src ptr  (32-bit pixels, advanced by 4 bytes per pixel)
 //   r2:  count    (number of pixels)
 //
 // Register usage:
 //   r3:  d   (destination pixel(s) as loaded from memory)
 //   r4:  s0  (source pixel blended into the low destination half)
 //   r5:  s1  (source pixel blended into the high destination half)
 //   r6:  pixel macro scratch
 //   r7:  pixel macro scratch
 //   r8:  free
 //   r9:  free
 //   r10: free
 //   r11: free
 //   r12: blended result built by the pixel macro
 //   r14: pixel macro scratch
 //
 // r4-r7 and lr are saved on entry and restored before returning.

 scanline_t32cb16blend_arm:
     stmfd   sp!, {r4-r7, lr}

     pld     [r0]
     pld     [r1]

     // align DST to 32 bits: if it is not word aligned, blend one pixel on
     // its own first so the main loop can use word-sized loads and stores.
     // NOTE(review): assumes DST is at least 16-bit aligned -- confirm
     // against callers.
     tst     r0, #0x3
     beq     aligned
     subs    r2, r2, #1
     ldmfdlo sp!, {r4-r7, lr}        // count was 0: return
     bxlo    lr

     // Blend exactly one pixel with halfword accesses. Reached for the
     // leading unaligned pixel and, from 9: below, for a trailing odd pixel.
 last:
     ldr     r4, [r1], #4
     ldrh    r3, [r0]
     pixel   r3, r4, r12, 0
     strh    r12, [r0], #2

     // After each "subs r2, r2, #2" below, r2 = (pixels remaining) - 2, so
     // a non-negative r2 means at least one more full pair is available.
 aligned:
     subs    r2, r2, #2
     blo     9f

     // The main loop is unrolled twice and processes 4 pixels
 8:  ldmia   r1!, {r4, r5}
     // stream the source
     pld     [r1, #32]
     add     r0, r0, #4
     // both source pixels are all zero: the destination pair is neither
     // loaded nor stored, skip it
     orrs    r3, r4, r5
     beq     7f

     // load the destination
     ldr     r3, [r0, #-4]
     // stream the destination
     pld     [r0, #32]
     pixel   r3, r4, r12, 0          // low half, initializes r12
     pixel   r3, r5, r12, 1          // high half, OR-ed into r12
     // effectively, we're getting write-combining by virtue of the
     // cpu's write-back cache.
     str     r12, [r0, #-4]

     // 2nd iteration of the loop, don't stream anything
     subs    r2, r2, #2
     blt     9f
     ldmia   r1!, {r4, r5}
     add     r0, r0, #4
     orrs    r3, r4, r5
     beq     7f
     ldr     r3, [r0, #-4]
     pixel   r3, r4, r12, 0
     pixel   r3, r5, r12, 1          // \ODD is documented as 0 or 1
     str     r12, [r0, #-4]


 7:  subs    r2, r2, #2
     bhs     8b

     // Fewer than two pixels are left: r2 is -2 (none) or -1 (one).
     // Nothing needs to be carried in r4, "last" reloads it from [r1].
 9:  adds    r2, r2, #1
     ldmfdlo sp!, {r4-r7, lr}        // no carry: nothing left, return
     bxlo    lr
     b       last
	/* libs/pixelflinger/t32cb16blend.S
	**
	** Copyright 2006, The Android Open Source Project
	**
	** Licensed under the Apache License, Version 2.0 (the "License");
	** you may not use this file except in compliance with the License.
	** You may obtain a copy of the License at
	**
	** http://www.apache.org/licenses/LICENSE-2.0
	**
	** Unless required by applicable law or agreed to in writing, software
	** distributed under the License is distributed on an "AS IS" BASIS,
	** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	** See the License for the specific language governing permissions and
	** limitations under the License.
	*/


	.text
	.syntax unified
	.align

	.global scanline_t32cb16blend_arm


	/*
	* .macro pixel
	*
	* Blends one pre-multiplied 32-bit source pixel over one RGB565 destination
	* pixel and places the result in the matching 16-bit half of \FB.
	*
	* \DREG is a 32-bit register containing two original destination RGB565
	* pixels, with the even one in the low-16 bits, and the odd one in the
	* high 16 bits.
	*
	* \SRC is a 32-bit 0xAABBGGRR pixel value, with pre-multiplied colors.
	*
	* \FB is a target register that will contain the blended pixel values.
	* With \ODD == 0 it is overwritten (its high 16 bits end up zero);
	* with \ODD != 0 the result is OR-ed into its high 16 bits, so those
	* bits must be zero on entry, as they are after an \ODD == 0 expansion.
	*
	* \ODD is either 0 or 1 and indicates if we're blending the lower or
	* upper 16-bit pixels in DREG into FB
	*
	* Per channel the result is
	* min(max, src_channel + ((dst_channel * f) >> 8))
	* where f = 0x100 - (sA + (sA >> 7)), src_channel is the top 5 (red, blue)
	* or top 6 (green) bits of the corresponding source byte, and max is 0x1F
	* (red, blue) or 0x3F (green).
	*
	* \DREG, \SRC and \FB must not be r6, r7 or lr, and \FB must be distinct
	* from \DREG and \SRC: the inputs are still read after the scratch
	* registers and \FB have been written.
	*
	* clobbered: r6, r7, lr, condition flags (every channel ends with a cmp)
	*
	*/

	.macro pixel, DREG, SRC, FB, ODD

	// SRC = 0xAABBGGRR
	mov r7, \SRC, lsr #24 // r7 = sA (source alpha, 0..0xFF)
	add r7, r7, r7, lsr #7 // sA + (sA >> 7), i.e. 0..0x100
	rsb r7, r7, #0x100 // r7 = f = 0x100 - (sA+(sA>>7))

	// NOTE(review): this local label is not referenced anywhere in the
	// visible source.
	1:

	.if \ODD

	// Odd pixel: destination fields are taken from the high 16 bits of
	// \DREG and every result is OR-ed into the high 16 bits of \FB.

	// red
	mov lr, \DREG, lsr #(16 + 11) // top 5 bits, no mask needed
	smulbb lr, r7, lr // f * dst red
	mov r6, \SRC, lsr #3
	and r6, r6, #0x1F // top 5 bits of source RR
	add lr, r6, lr, lsr #8
	cmp lr, #0x1F // saturate at 0x1F
	orrhs \FB, \FB, #(0x1F<<(16 + 11))
	orrlo \FB, \FB, lr, lsl #(16 + 11)

	// green
	and r6, \DREG, #(0x3F<<(16 + 5)) // field kept in place
	smulbt r6, r7, r6 // f * (green << 5), top half of r6
	mov lr, \SRC, lsr #(8+2)
	and lr, lr, #0x3F // top 6 bits of source GG
	add r6, lr, r6, lsr #(5+8) // undo the << 5 and the 8-bit scale
	cmp r6, #0x3F // saturate at 0x3F
	orrhs \FB, \FB, #(0x3F<<(16 + 5))
	orrlo \FB, \FB, r6, lsl #(16 + 5)

	// blue
	and lr, \DREG, #(0x1F << 16)
	smulbt lr, r7, lr // f * dst blue, top half of lr
	mov r6, \SRC, lsr #(8+8+3)
	and r6, r6, #0x1F // top 5 bits of source BB
	add lr, r6, lr, lsr #8
	cmp lr, #0x1F // saturate at 0x1F
	orrhs \FB, \FB, #(0x1F << 16)
	orrlo \FB, \FB, lr, lsl #16

	.else

	// Even pixel: destination fields are taken from the low 16 bits of
	// \DREG. The red result is written with mov, which (re)initializes
	// \FB; green and blue are then OR-ed in.

	// red
	mov lr, \DREG, lsr #11
	and lr, lr, #0x1F // drop the odd pixel's bits
	smulbb lr, r7, lr // f * dst red
	mov r6, \SRC, lsr #3
	and r6, r6, #0x1F // top 5 bits of source RR
	add lr, r6, lr, lsr #8
	cmp lr, #0x1F // saturate at 0x1F
	movhs \FB, #(0x1F<<11)
	movlo \FB, lr, lsl #11


	// green
	and r6, \DREG, #(0x3F<<5) // field kept in place
	smulbb r6, r7, r6 // f * (green << 5)
	mov lr, \SRC, lsr #(8+2)
	and lr, lr, #0x3F // top 6 bits of source GG
	add r6, lr, r6, lsr #(5+8) // undo the << 5 and the 8-bit scale
	cmp r6, #0x3F // saturate at 0x3F
	orrhs \FB, \FB, #(0x3F<<5)
	orrlo \FB, \FB, r6, lsl #5

	// blue
	and lr, \DREG, #0x1F
	smulbb lr, r7, lr // f * dst blue
	mov r6, \SRC, lsr #(8+8+3)
	and r6, r6, #0x1F // top 5 bits of source BB
	add lr, r6, lr, lsr #8
	cmp lr, #0x1F // saturate at 0x1F
	orrhs \FB, \FB, #0x1F
	orrlo \FB, \FB, lr

	.endif

	.endm


	// scanline_t32cb16blend_arm
	//
	// Blends a run of 32-bit source pixels (0xAABBGGRR, pre-multiplied, see the
	// pixel macro) over 16-bit RGB565 destination pixels, in place.
	//
	// In:
	// r0: dst ptr (16-bit pixels, advanced by 2 bytes per pixel)
	// r1: src ptr (32-bit pixels, advanced by 4 bytes per pixel)
	// r2: count (number of pixels)
	//
	// Register usage:
	// r3: d (destination pixel(s) as loaded from memory)
	// r4: s0 (source pixel blended into the low destination half)
	// r5: s1 (source pixel blended into the high destination half)
	// r6: pixel macro scratch
	// r7: pixel macro scratch
	// r8: free
	// r9: free
	// r10: free
	// r11: free
	// r12: blended result built by the pixel macro
	// r14: pixel macro scratch
	//
	// r4-r7 and lr are saved on entry and restored before returning.

	scanline_t32cb16blend_arm:
	stmfd sp!, {r4-r7, lr}

	pld [r0]
	pld [r1]

	// align DST to 32 bits: if it is not word aligned, blend one pixel on
	// its own first so the main loop can use word-sized loads and stores.
	// NOTE(review): assumes DST is at least 16-bit aligned -- confirm
	// against callers.
	tst r0, #0x3
	beq aligned
	subs r2, r2, #1
	ldmfdlo sp!, {r4-r7, lr} // count was 0: return
	bxlo lr

	// Blend exactly one pixel with halfword accesses. Reached for the
	// leading unaligned pixel and, from 9: below, for a trailing odd pixel.
	last:
	ldr r4, [r1], #4
	ldrh r3, [r0]
	pixel r3, r4, r12, 0
	strh r12, [r0], #2

	// After each "subs r2, r2, #2" below, r2 = (pixels remaining) - 2, so
	// a non-negative r2 means at least one more full pair is available.
	aligned:
	subs r2, r2, #2
	blo 9f

	// The main loop is unrolled twice and processes 4 pixels
	8: ldmia r1!, {r4, r5}
	// stream the source
	pld [r1, #32]
	add r0, r0, #4
	// both source pixels are all zero: the destination pair is neither
	// loaded nor stored, skip it
	orrs r3, r4, r5
	beq 7f

	// load the destination
	ldr r3, [r0, #-4]
	// stream the destination
	pld [r0, #32]
	pixel r3, r4, r12, 0 // low half, initializes r12
	pixel r3, r5, r12, 1 // high half, OR-ed into r12
	// effectively, we're getting write-combining by virtue of the
	// cpu's write-back cache.
	str r12, [r0, #-4]

	// 2nd iteration of the loop, don't stream anything
	subs r2, r2, #2
	blt 9f
	ldmia r1!, {r4, r5}
	add r0, r0, #4
	orrs r3, r4, r5
	beq 7f
	ldr r3, [r0, #-4]
	pixel r3, r4, r12, 0
	pixel r3, r5, r12, 1 // \ODD is documented as 0 or 1
	str r12, [r0, #-4]


	7: subs r2, r2, #2
	bhs 8b

	// Fewer than two pixels are left: r2 is -2 (none) or -1 (one).
	// Nothing needs to be carried in r4, "last" reloads it from [r1].
	9: adds r2, r2, #1
	ldmfdlo sp!, {r4-r7, lr} // no carry: nothing left, return
	bxlo lr
	b last