// libpixelflinger/arch-arm64/t32cb16blend.S - android-core - Git at Google

 /*
  * Copyright (C) 2013 The Android Open Source Project
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *  * Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  *  * Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in
  *    the documentation and/or other materials provided with the
  *    distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
     .text
     // A64 instructions are 4 bytes wide and must start on a 4-byte
     // boundary, so request 2^2 alignment explicitly instead of relying
     // on the default alignment of the section.
     .p2align 2

     .global scanline_t32cb16blend_arm64

 /*
  * .macro pixel
  *
  *  Alpha-blends one RGB565 destination pixel, located in either the
  *  bottom or the top 16 bits of DREG, with the 32-bit source pixel SRC
  *  and merges the result into FB.
  *
  *  With sA = SRC >> 24 and f = 0x100 - (sA + (sA >> 7)), so that sA in
  *  0..255 maps to f in 256..0, every channel is computed as
  *
  *      out = src_channel + ((dst_channel * f) >> 8)
  *
  *  and saturated to the channel maximum (0x1F for red and blue, 0x3F
  *  for green). src_channel is the top 5 (red, blue) or 6 (green) bits
  *  of the matching byte of SRC.
  *
  * \DREG is a 32-bit register containing *two* original destination RGB565
  *       pixels, with the even one in the low 16 bits, and the odd one in
  *       the high 16 bits. It is only read.
  *
  * \SRC is a 32-bit 0xAABBGGRR pixel value, with pre-multiplied colors.
  *      It is only read.
  *
  * \FB is the target register that receives the blended pixel.
  *     With ODD=0 the previous contents of FB are discarded and the result
  *     occupies bits 0..15 (bits 16..31 end up zero).
  *     With ODD=1 the result is OR-ed into bits 16..31, so those bits must
  *     be zero on entry, which holds right after an ODD=0 expansion on the
  *     same FB.
  *
  * \ODD is either 0 or 1 and indicates if we're blending the lower (0) or
  *      the upper (1) 16-bit pixel of DREG into FB.
  *
  * DREG, SRC and FB must be distinct from each other and from the
  * clobbered registers.
  *
  * clobbered: w6, w7, w15, w16, w17 and the condition flags
  *
  * w18 is deliberately not used as scratch: x18 is the AAPCS64 platform
  * register and may be reserved by the platform.
  */

 .macro pixel,   DREG, SRC, FB, ODD

     // SRC = 0xAABBGGRR
     lsr     w7, \SRC, #24               // sA
     add     w7, w7, w7, lsr #7          // sA + (sA >> 7)
     mov     w6, #0x100
     sub     w7, w6, w7                  // w7 = 0x100 - (sA + (sA >> 7))

 1:  // not referenced anywhere in this file

 .if \ODD // Blending odd pixel present in top 16 bits of DREG register

     // red: destination bits 31..27, no mask needed after the shift
     lsr     w16, \DREG, #(16 + 11)
     mul     w16, w7, w16
     lsr     w6, \SRC, #3
     and     w6, w6, #0x1F
     add     w16, w6, w16, lsr #8
     cmp     w16, #0x1F
     orr     w17, \FB, #(0x1F<<(16 + 11))        // saturated candidate
     orr     w15, \FB, w16, lsl #(16 + 11)       // in-range candidate
     csel    \FB, w17, w15, hi

     // green: destination bits 26..21
     and     w6, \DREG, #(0x3F<<(16 + 5))
     lsr     w17, w6, #(16 + 5)
     mul     w6, w7, w17
     lsr     w16, \SRC, #(8+2)
     and     w16, w16, #0x3F
     add     w6, w16, w6, lsr #8
     cmp     w6, #0x3F
     orr     w17, \FB, #(0x3F<<(16 + 5))
     orr     w15, \FB, w6, lsl #(16 + 5)
     csel    \FB, w17, w15, hi

     // blue: destination bits 20..16
     and     w16, \DREG, #(0x1F << 16)
     lsr     w17, w16, #16
     mul     w16, w7, w17
     lsr     w6, \SRC, #(8+8+3)
     and     w6, w6, #0x1F
     add     w16, w6, w16, lsr #8
     cmp     w16, #0x1F
     orr     w17, \FB, #(0x1F << 16)
     orr     w15, \FB, w16, lsl #16
     csel    \FB, w17, w15, hi

 .else // Blending even pixel present in bottom 16 bits of DREG register

     // red: destination bits 15..11; this step initialises FB
     lsr     w16, \DREG, #11
     and     w16, w16, #0x1F
     mul     w16, w7, w16
     lsr     w6, \SRC, #3
     and     w6, w6, #0x1F
     add     w16, w6, w16, lsr #8
     cmp     w16, #0x1F
     mov     w17, #(0x1F<<11)
     lsl     w15, w16, #11
     csel    \FB, w17, w15, hi

     // green: destination bits 10..5; the field is multiplied in place
     // and the extra 5 bits are dropped by the combined shift below
     and     w6, \DREG, #(0x3F<<5)
     mul     w6, w7, w6
     lsr     w16, \SRC, #(8+2)
     and     w16, w16, #0x3F
     add     w6, w16, w6, lsr #(5+8)
     cmp     w6, #0x3F
     orr     w17, \FB, #(0x3F<<5)
     orr     w15, \FB, w6, lsl #5
     csel    \FB, w17, w15, hi

     // blue: destination bits 4..0
     and     w16, \DREG, #0x1F
     mul     w16, w7, w16
     lsr     w6, \SRC, #(8+8+3)
     and     w6, w6, #0x1F
     add     w16, w6, w16, lsr #8
     cmp     w16, #0x1F
     orr     w17, \FB, #0x1F
     orr     w15, \FB, w16
     csel    \FB, w17, w15, hi

 .endif // End of blending even pixel

 .endm // End of pixel macro


 // scanline_t32cb16blend_arm64
 //
 // Blends a run of 32-bit 0xAABBGGRR source pixels onto 16-bit RGB565
 // destination pixels, using the pixel macro for the per-pixel math.
 //
 // In:
 //   x0:  dst ptr (16-bit pixels), advanced as pixels are written
 //   x1:  src ptr (32-bit pixels), advanced as pixels are read
 //   w2:  count of pixels still to process
 // Scratch:
 //   w3:  d, destination halfword or word being blended
 //   w4:  s0, source pixel for the even (or single) destination pixel
 //   w5:  s1, source pixel for the odd destination pixel
 //   w12: blended result that gets stored back
 //   plus every register the pixel macro lists as clobbered
 //
 // Leaf routine: makes no calls and does not touch the stack.

 scanline_t32cb16blend_arm64:

     // align DST to 32 bits: blend one leading pixel if needed
     tst     x0, #0x3
     b.eq    .Lt32cb16_aligned
     subs    w2, w2, #1
     b.lo    .Lt32cb16_return            // count was 0

     // Blend exactly one pixel (unaligned head or odd tail).
 .Lt32cb16_single:
     ldr     w4, [x1], #4
     ldrh    w3, [x0]
     pixel   w3, w4, w12, 0
     strh    w12, [x0], #2

 .Lt32cb16_aligned:
     subs    w2, w2, #2
     b.lo    .Lt32cb16_tail              // fewer than 2 pixels left

     // The main loop is unrolled twice and processes 4 pixels.
     // Invariant on entry: w2 = pixels remaining - 2, and w2 >= 0.
 .Lt32cb16_loop:
     ldp     w4, w5, [x1], #8
     add     x0, x0, #4
     // both source pixels are all zero: leave the destination untouched
     orr     w3, w4, w5
     cbz     w3, .Lt32cb16_next

     // load both destination pixels, blend them and store them back
     ldr     w3, [x0, #-4]
     pixel   w3, w4, w12, 0
     pixel   w3, w5, w12, 1
     str     w12, [x0, #-4]

     // 2nd copy of the loop body. s0 is always reloaded from memory
     // before it is used again, so nothing has to be handed over in
     // registers when leaving for the tail.
     subs    w2, w2, #2
     b.lt    .Lt32cb16_tail
     ldp     w4, w5, [x1], #8
     add     x0, x0, #4
     orr     w3, w4, w5
     cbz     w3, .Lt32cb16_next
     ldr     w3, [x0, #-4]
     pixel   w3, w4, w12, 0
     pixel   w3, w5, w12, 1
     str     w12, [x0, #-4]

 .Lt32cb16_next:
     subs    w2, w2, #2
     b.hs    .Lt32cb16_loop

     // Here w2 is -1 (one pixel left) or -2 (none left). Adding 1
     // carries out only in the first case.
 .Lt32cb16_tail:
     adds    w2, w2, #1
     b.hs    .Lt32cb16_single

 .Lt32cb16_return:
     ret
	/*
	* Copyright (C) 2013 The Android Open Source Project
	* All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	* * Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* * Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in
	* the documentation and/or other materials provided with the
	* distribution.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
	* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	* SUCH DAMAGE.
	*/
	.text
	// A64 instructions are 4 bytes wide and must start on a 4-byte
	// boundary, so request 2^2 alignment explicitly instead of relying
	// on the default alignment of the section.
	.p2align 2

	.global scanline_t32cb16blend_arm64

	/*
	 * .macro pixel
	 *
	 * Alpha-blends one RGB565 destination pixel, located in either the
	 * bottom or the top 16 bits of DREG, with the 32-bit source pixel SRC
	 * and merges the result into FB.
	 *
	 * With sA = SRC >> 24 and f = 0x100 - (sA + (sA >> 7)), so that sA in
	 * 0..255 maps to f in 256..0, every channel is computed as
	 *
	 *     out = src_channel + ((dst_channel * f) >> 8)
	 *
	 * and saturated to the channel maximum (0x1F for red and blue, 0x3F
	 * for green). src_channel is the top 5 (red, blue) or 6 (green) bits
	 * of the matching byte of SRC.
	 *
	 * \DREG is a 32-bit register containing two original destination RGB565
	 * pixels, with the even one in the low 16 bits, and the odd one in the
	 * high 16 bits. It is only read.
	 *
	 * \SRC is a 32-bit 0xAABBGGRR pixel value, with pre-multiplied colors.
	 * It is only read.
	 *
	 * \FB is the target register that receives the blended pixel.
	 * With ODD=0 the previous contents of FB are discarded and the result
	 * occupies bits 0..15 (bits 16..31 end up zero).
	 * With ODD=1 the result is OR-ed into bits 16..31, so those bits must
	 * be zero on entry, which holds right after an ODD=0 expansion on the
	 * same FB.
	 *
	 * \ODD is either 0 or 1 and indicates if we're blending the lower (0) or
	 * the upper (1) 16-bit pixel of DREG into FB.
	 *
	 * DREG, SRC and FB must be distinct from each other and from the
	 * clobbered registers.
	 *
	 * clobbered: w6, w7, w15, w16, w17 and the condition flags
	 *
	 * w18 is deliberately not used as scratch: x18 is the AAPCS64 platform
	 * register and may be reserved by the platform.
	 */

	.macro pixel, DREG, SRC, FB, ODD

	// SRC = 0xAABBGGRR
	lsr     w7, \SRC, #24               // sA
	add     w7, w7, w7, lsr #7          // sA + (sA >> 7)
	mov     w6, #0x100
	sub     w7, w6, w7                  // w7 = 0x100 - (sA + (sA >> 7))

	1:  // not referenced anywhere in this file

	.if \ODD // Blending odd pixel present in top 16 bits of DREG register

	// red: destination bits 31..27, no mask needed after the shift
	lsr     w16, \DREG, #(16 + 11)
	mul     w16, w7, w16
	lsr     w6, \SRC, #3
	and     w6, w6, #0x1F
	add     w16, w6, w16, lsr #8
	cmp     w16, #0x1F
	orr     w17, \FB, #(0x1F<<(16 + 11))        // saturated candidate
	orr     w15, \FB, w16, lsl #(16 + 11)       // in-range candidate
	csel    \FB, w17, w15, hi

	// green: destination bits 26..21
	and     w6, \DREG, #(0x3F<<(16 + 5))
	lsr     w17, w6, #(16 + 5)
	mul     w6, w7, w17
	lsr     w16, \SRC, #(8+2)
	and     w16, w16, #0x3F
	add     w6, w16, w6, lsr #8
	cmp     w6, #0x3F
	orr     w17, \FB, #(0x3F<<(16 + 5))
	orr     w15, \FB, w6, lsl #(16 + 5)
	csel    \FB, w17, w15, hi

	// blue: destination bits 20..16
	and     w16, \DREG, #(0x1F << 16)
	lsr     w17, w16, #16
	mul     w16, w7, w17
	lsr     w6, \SRC, #(8+8+3)
	and     w6, w6, #0x1F
	add     w16, w6, w16, lsr #8
	cmp     w16, #0x1F
	orr     w17, \FB, #(0x1F << 16)
	orr     w15, \FB, w16, lsl #16
	csel    \FB, w17, w15, hi

	.else // Blending even pixel present in bottom 16 bits of DREG register

	// red: destination bits 15..11; this step initialises FB
	lsr     w16, \DREG, #11
	and     w16, w16, #0x1F
	mul     w16, w7, w16
	lsr     w6, \SRC, #3
	and     w6, w6, #0x1F
	add     w16, w6, w16, lsr #8
	cmp     w16, #0x1F
	mov     w17, #(0x1F<<11)
	lsl     w15, w16, #11
	csel    \FB, w17, w15, hi

	// green: destination bits 10..5; the field is multiplied in place
	// and the extra 5 bits are dropped by the combined shift below
	and     w6, \DREG, #(0x3F<<5)
	mul     w6, w7, w6
	lsr     w16, \SRC, #(8+2)
	and     w16, w16, #0x3F
	add     w6, w16, w6, lsr #(5+8)
	cmp     w6, #0x3F
	orr     w17, \FB, #(0x3F<<5)
	orr     w15, \FB, w6, lsl #5
	csel    \FB, w17, w15, hi

	// blue: destination bits 4..0
	and     w16, \DREG, #0x1F
	mul     w16, w7, w16
	lsr     w6, \SRC, #(8+8+3)
	and     w6, w6, #0x1F
	add     w16, w6, w16, lsr #8
	cmp     w16, #0x1F
	orr     w17, \FB, #0x1F
	orr     w15, \FB, w16
	csel    \FB, w17, w15, hi

	.endif // End of blending even pixel

	.endm // End of pixel macro


	// scanline_t32cb16blend_arm64
	//
	// Blends a run of 32-bit 0xAABBGGRR source pixels onto 16-bit RGB565
	// destination pixels, using the pixel macro for the per-pixel math.
	//
	// In:
	//   x0:  dst ptr (16-bit pixels), advanced as pixels are written
	//   x1:  src ptr (32-bit pixels), advanced as pixels are read
	//   w2:  count of pixels still to process
	// Scratch:
	//   w3:  d, destination halfword or word being blended
	//   w4:  s0, source pixel for the even (or single) destination pixel
	//   w5:  s1, source pixel for the odd destination pixel
	//   w12: blended result that gets stored back
	//   plus every register the pixel macro lists as clobbered
	//
	// Leaf routine: makes no calls and does not touch the stack.

	scanline_t32cb16blend_arm64:

	// align DST to 32 bits: blend one leading pixel if needed
	tst     x0, #0x3
	b.eq    .Lt32cb16_aligned
	subs    w2, w2, #1
	b.lo    .Lt32cb16_return            // count was 0

	// Blend exactly one pixel (unaligned head or odd tail).
	.Lt32cb16_single:
	ldr     w4, [x1], #4
	ldrh    w3, [x0]
	pixel   w3, w4, w12, 0
	strh    w12, [x0], #2

	.Lt32cb16_aligned:
	subs    w2, w2, #2
	b.lo    .Lt32cb16_tail              // fewer than 2 pixels left

	// The main loop is unrolled twice and processes 4 pixels.
	// Invariant on entry: w2 = pixels remaining - 2, and w2 >= 0.
	.Lt32cb16_loop:
	ldp     w4, w5, [x1], #8
	add     x0, x0, #4
	// both source pixels are all zero: leave the destination untouched
	orr     w3, w4, w5
	cbz     w3, .Lt32cb16_next

	// load both destination pixels, blend them and store them back
	ldr     w3, [x0, #-4]
	pixel   w3, w4, w12, 0
	pixel   w3, w5, w12, 1
	str     w12, [x0, #-4]

	// 2nd copy of the loop body. s0 is always reloaded from memory
	// before it is used again, so nothing has to be handed over in
	// registers when leaving for the tail.
	subs    w2, w2, #2
	b.lt    .Lt32cb16_tail
	ldp     w4, w5, [x1], #8
	add     x0, x0, #4
	orr     w3, w4, w5
	cbz     w3, .Lt32cb16_next
	ldr     w3, [x0, #-4]
	pixel   w3, w4, w12, 0
	pixel   w3, w5, w12, 1
	str     w12, [x0, #-4]

	.Lt32cb16_next:
	subs    w2, w2, #2
	b.hs    .Lt32cb16_loop

	// Here w2 is -1 (one pixel left) or -2 (none left). Adding 1
	// carries out only in the first case.
	.Lt32cb16_tail:
	adds    w2, w2, #1
	b.hs    .Lt32cb16_single

	.Lt32cb16_return:
	ret