/* cpu_ref/rsCpuIntrinsics_advsimd_3DLUT.S - platform/frameworks/rs - Git at Google */

 /*
  * Copyright (C) 2014 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 /* ENTRY(f): switch to the .text section, align, declare f as a global symbol
  * of type function, and emit the label f.
  * END(f): set the size of symbol f to the distance from its label to here.
  */
 #define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
 #define END(f) .size f, .-f;


 /* lanepair: interpolate two pixels out of the lookup table and pack the
  * resulting 8 bytes into dst.
  *
  * Arguments:
  *   dst        byte-vector destination of the final narrowing; v20.16b and
  *              v21.16b select uqrshrn2 (upper half), anything else uqrshrn
  *   src0,src1  32-bit vector lanes holding the byte offset of each pixel's
  *              table cell
  *   xr0,xr1    halfword lanes holding the x fractions
  *   yr0,yr1    byte lanes holding the y fractions
  *   zr0,zr1    halfword lanes holding the z fractions
  *
  * Reads:    x3 (added to the offsets to form addresses), x4 (step to the
  *           next y row), x5 (step to the next z slice), v5 (must be zero)
  * Clobbers: x6, x7, v8-v19
  *
  * Every blend below has the form a*256 - a*f + b*f, i.e. a weighted sum of
  * a and b with weights (256 - f) and f.
  */
 .macro lanepair dst, src0, src1, xr0, xr1, yr0, yr1, zr0, zr1

             /* x6/x7 = x3 + sign-extended table offset of each pixel */
             smov        x6, \src0
             smov        x7, \src1

             add         x6, x6, x3
             add         x7, x7, x3

             /* v16/v17 = 8 bytes at the cell; pointers then advance by x4 */
             ld1         {v16.2s}, [x6], x4
             ld1         {v17.2s}, [x7], x4

             /* v18/v19 = 8 bytes one y step on; pointers then advance by x5 */
             ld1         {v18.2s}, [x6], x5
             ld1         {v19.2s}, [x7], x5

             /* broadcast the y fraction of each pixel across a byte vector */
             dup         v8.8b, \yr0
             dup         v9.8b, \yr1
             /* Y interpolate, front, lanes 0 and 1 -> v12 and v13 */
             /* zip1 with the zero vector v5 places each byte in the high half
              * of a halfword, giving the a*256 term.
              */
             zip1        v12.16b, v5.16b, v16.16b
             zip1        v13.16b, v5.16b, v17.16b
             umlsl       v12.8h, v16.8b, v8.8b
             umlsl       v13.8h, v17.8b, v9.8b
             umlal       v12.8h, v18.8b, v8.8b
             umlal       v13.8h, v19.8b, v9.8b

             /* pointers are now one y step and one z step on from the cell */
             ld1         {v18.2s}, [x6]
             ld1         {v19.2s}, [x7]

             /* step back by x4: one z step on from the cell, original y */
             sub         x6, x6, x4
             sub         x7, x7, x4

             ld1         {v16.2s}, [x6]
             ld1         {v17.2s}, [x7]

             /* Y interpolate, rear, lanes 0 and 1 -> v14 and v15 */
             zip1        v14.16b, v5.16b, v16.16b
             zip1        v15.16b, v5.16b, v17.16b
             umlsl       v14.8h, v16.8b, v8.8b
             umlsl       v15.8h, v17.8b, v9.8b
             umlal       v14.8h, v18.8b, v8.8b
             umlal       v15.8h, v19.8b, v9.8b

             /* Z interpolate, lane 0 v12/v14 -> v10 */
             /* widen to 32 bits with a shift of 8 (the a*256 term), blend
              * front with rear, then narrow back with rounding.
              */
             ushll       v8.4s, v12.4h, #8
             ushll2      v9.4s, v12.8h, #8
             umlsl       v8.4s, v12.4h, \zr0
             umlsl2      v9.4s, v12.8h, \zr0
             umlal       v8.4s, v14.4h, \zr0
             umlal2      v9.4s, v14.8h, \zr0
             rshrn       v10.4h, v8.4s, #8
             rshrn2      v10.8h, v9.4s, #8

             /* Z interpolate, lane 1 v13/v15 -> v11 */
             ushll       v8.4s, v13.4h, #8
             ushll2      v9.4s, v13.8h, #8
             umlsl       v8.4s, v13.4h, \zr1
             umlsl2      v9.4s, v13.8h, \zr1
             umlal       v8.4s, v15.4h, \zr1
             umlal2      v9.4s, v15.8h, \zr1
             rshrn       v11.4h, v8.4s, #8
             rshrn2      v11.8h, v9.4s, #8

             /* X interpolate, lanes 0 and 1 v10,v11 -> v14 */
             /* the low four halfwords (first 4 loaded bytes) are blended with
              * the high four halfwords (next 4 loaded bytes); this narrowing
              * truncates rather than rounds.
              */
             ushll       v8.4s, v10.4h, #8
             ushll       v9.4s, v11.4h, #8
             umlsl       v8.4s, v10.4h, \xr0
             umlsl       v9.4s, v11.4h, \xr1
             umlal2      v8.4s, v10.8h, \xr0
             umlal2      v9.4s, v11.8h, \xr1
             shrn        v14.4h, v8.4s, #8
             shrn2       v14.8h, v9.4s, #8

             /* pack lanes 0-1 -> dst (saturating, rounding narrow to bytes);
              * the .16b destinations fill the upper half of v20/v21.
              */
 .ifc \dst, v20.16b
             uqrshrn2    \dst, v14.8h, #8
 .else ; .ifc \dst, v21.16b
             uqrshrn2    \dst, v14.8h, #8
 .else
             uqrshrn     \dst, v14.8h, #8
 .endif ; .endif
 .endm

 /* void rsdIntrinsic3DLUT_K(
  *          void *dst,          // x0
  *          void const *in,     // x1
  *          size_t count,       // x2
  *          void const *lut,    // x3
  *          int32_t pitchy,     // w4
  *          int32_t pitchz,     // w5
  *          int dimx,           // w6
  *          int dimy,           // w7
  *          int dimz);          // [sp]
  *
  * Processes count 4-byte pixels from in to dst, eight per iteration.  The
  * first three bytes of each pixel are scaled by dimx, dimy and dimz to give
  * an integer table coordinate plus an 8-bit fraction; the surrounding table
  * entries are interpolated by the lanepair macro.  The fourth byte of each
  * pixel is copied from the input unchanged.
  *
  * count is tested with signed comparisons; a value <= 0 returns without
  * touching dst.  A final group of fewer than eight pixels is loaded and
  * stored lane by lane, so no access goes past count pixels of in or dst.
  *
  * Callee-saved d8-d15 are spilled to the stack; x4-x8 are used as scratch.
  */
 ENTRY(rsdIntrinsic3DLUT_K)
             /* fetch dimz before sp moves */
             ldr         w8, [sp]
             stp         d8, d9, [sp, #-64]!
             stp         d10, d11, [sp, #16]
             stp         d12, d13, [sp, #32]
             stp         d14, d15, [sp, #48]
             /* pitchy and pitchz are 32-bit arguments, so the upper halves of
              * x4 and x5 are unspecified on entry.  lanepair uses the full
              * 64-bit registers for address arithmetic, so sign-extend first.
              */
             sxtw        x4, w4
             sxtw        x5, w5
             /* v4.h[0..2] = dimx, dimy, dimz; v4.s[2] = pitchy; v4.s[3] = pitchz */
             movi        v4.8b, #1
             ins         v4.h[0], w6
             ins         v4.h[1], w7
             ins         v4.h[2], w8
             ins         v4.s[2], w4
             ins         v4.s[3], w5
             /* v5 = 0, required by lanepair */
             movi        v5.16b, #0

             /* x2 = count - 8: >= 0 means a full group of eight is available */
             subs        x2, x2, #8
             bge         2f
             cmn         x2, #8    // same as cmp x2, #-8
             ble         9f        /* count <= 0: nothing to do */
             b           4f        /* 1..7 pixels: partial load */

             .align 6
 1:          st4         {v20.8b,v21.8b,v22.8b,v23.8b}, [x0], #32
 /* x0  = dst
  * x1  = src
  * x2  = count
  * x3  = lut
  * x4  = pitchy
  * x5  = pitchz
  * x6 = offset0
  * x7 = offset1
  */
 2:          ld4         {v0.8b-v3.8b}, [x1], #32
 /* v0,v1,v2,v3 source data
  * v4 dimensions and pitches
  */
 3:          uxtl        v0.8h, v0.8b
             uxtl        v1.8h, v1.8b
             uxtl        v2.8h, v2.8b
             mul         v0.8h, v0.8h, v4.h[0]
             mul         v1.8h, v1.8h, v4.h[1]
             mul         v2.8h, v2.8h, v4.h[2]

 /* ursra below would be more accurate, but this can result in a dim.0 case
  * where we try to read from the limit of the array and the limit +1 to
  * interpolate, even though the fractional component is zero.  Strictly this is
  * correct, except for the illegal access problem.
  */
             usra        v0.8h, v0.8h, #8
             usra        v1.8h, v1.8h, #8
             usra        v2.8h, v2.8h, #8

             /* high byte = integer coordinate, low byte = fraction */
             ushr        v12.8h, v0.8h, #8
             ushr        v13.8h, v1.8h, #8
             ushr        v14.8h, v2.8h, #8
             bic         v0.8h, #0xff, LSL #8
             xtn         v1.8b, v1.8h
             bic         v2.8h, #0xff, LSL #8

 /* v0.8h,v1.8b,v2.8h fractional offset
  * v12.8h,v13.8h,v14.8h integer offset
  */

             /* offset = x * 4 + y * pitchy + z * pitchz, as 32-bit lanes */
             ushll       v6.4s, v12.4h, #2
             ushll2      v7.4s, v12.8h, #2
             uxtl        v8.4s, v13.4h
             uxtl2       v9.4s, v13.8h
             uxtl        v10.4s, v14.4h
             uxtl2       v11.4s, v14.8h
             mla         v6.4s, v8.4s,  v4.s[2]
             mla         v7.4s, v9.4s,  v4.s[2]
             mla         v6.4s, v10.4s, v4.s[3]
             mla         v7.4s, v11.4s, v4.s[3]

 /* v6,v7 list of table offsets */

         /* lanes 0 and 1 */
             lanepair    dst=v20.8b,  src0=v6.s[0], src1=v6.s[1], xr0=v0.h[0], xr1=v0.h[1], yr0=v1.b[0], yr1=v1.b[1], zr0=v2.h[0], zr1=v2.h[1]

         /* lanes 2 and 3 */
             lanepair    dst=v20.16b, src0=v6.s[2], src1=v6.s[3], xr0=v0.h[2], xr1=v0.h[3], yr0=v1.b[2], yr1=v1.b[3], zr0=v2.h[2], zr1=v2.h[3]

         /* lanes 4 and 5 */
             lanepair    dst=v21.8b,  src0=v7.s[0], src1=v7.s[1], xr0=v0.h[4], xr1=v0.h[5], yr0=v1.b[4], yr1=v1.b[5], zr0=v2.h[4], zr1=v2.h[5]

         /* lanes 6 and 7 */
             lanepair    dst=v21.16b, src0=v7.s[2], src1=v7.s[3], xr0=v0.h[6], xr1=v0.h[7], yr0=v1.b[6], yr1=v1.b[7], zr0=v2.h[6], zr1=v2.h[7]

             /* v20:v21 hold eight interleaved 4-byte results; de-interleave
              * them into one register per byte position for st4
              * (v20 = byte 0, v21 = byte 1, v22 = byte 2 of every pixel).
              */
             uzp1        v6.16b, v20.16b, v21.16b
             uzp2        v7.16b, v20.16b, v21.16b
             uzp1        v20.16b, v6.16b, v7.16b
             uzp2        v22.16b, v6.16b, v7.16b
             mov         v21.d[0], v20.d[1]

             subs        x2, x2, #8
             /* fourth byte of each pixel passes through from the input */
             mov         v23.8b, v3.8b

             bge         1b        /* another full group: store and reload */

             cmn         x2, #8    // same as cmp x2, #-8
             blt         1f        /* this was the partial group: partial store */

             /* st4 leaves the flags from cmn intact for the beq below */
             st4         {v20.8b,v21.8b,v22.8b,v23.8b}, [x0], #32
             beq         9f        /* no pixels remain */

             /* fill the vector  with a safe value */
 4:          ld4r        {v0.8b-v3.8b}, [x1]
             /* the low three bits of x2 equal the number of pixels left */
             tbz         x2, #2, 2f
             ld4         {v0.b-v3.b}[0], [x1], #4
             ld4         {v0.b-v3.b}[1], [x1], #4
             ld4         {v0.b-v3.b}[2], [x1], #4
             ld4         {v0.b-v3.b}[3], [x1], #4
 2:          tbz         x2, #1, 2f
             ld4         {v0.b-v3.b}[4], [x1], #4
             ld4         {v0.b-v3.b}[5], [x1], #4
 2:          tbz         x2, #0, 2f
             ld4         {v0.b-v3.b}[6], [x1], #4
 2:          b           3b

             /* partial store: same lane assignment as the partial load */
 1:          tst         x2, #4
             beq         2f
             st4         {v20.b-v23.b}[0], [x0], #4
             st4         {v20.b-v23.b}[1], [x0], #4
             st4         {v20.b-v23.b}[2], [x0], #4
             st4         {v20.b-v23.b}[3], [x0], #4
 2:          tst         x2, #2
             beq         2f
             st4         {v20.b-v23.b}[4], [x0], #4
             st4         {v20.b-v23.b}[5], [x0], #4
 2:          tst         x2, #1
             beq         9f
             st4         {v20.b-v23.b}[6], [x0], #4

             /* restore callee-saved d8-d15 and release the frame */
 9:          ldp         d14, d15, [sp, #48]
             ldp         d12, d13, [sp, #32]
             ldp         d10, d11, [sp, #16]
             ldp         d8, d9, [sp], #64
             ret
 END(rsdIntrinsic3DLUT_K)
	/*
	* Copyright (C) 2014 The Android Open Source Project
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/* ENTRY(f): switch to the .text section, align, declare f as a global symbol
	* of type function, and emit the label f.
	* END(f): set the size of symbol f to the distance from its label to here.
	*/
	#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
	#define END(f) .size f, .-f;


	/* lanepair: interpolate two pixels out of the lookup table and pack the
	* resulting 8 bytes into dst.
	*
	* Arguments:
	* dst - byte-vector destination of the final narrowing; v20.16b and
	* v21.16b select uqrshrn2 (upper half), anything else uqrshrn
	* src0,src1 - 32-bit vector lanes holding the byte offset of each pixel's
	* table cell
	* xr0,xr1 - halfword lanes holding the x fractions
	* yr0,yr1 - byte lanes holding the y fractions
	* zr0,zr1 - halfword lanes holding the z fractions
	*
	* Reads: x3 (added to the offsets to form addresses), x4 (step to the
	* next y row), x5 (step to the next z slice), v5 (must be zero)
	* Clobbers: x6, x7, v8-v19
	*
	* Every blend below has the form a*256 - a*f + b*f, i.e. a weighted sum of
	* a and b with weights (256 - f) and f.
	*/
	.macro lanepair dst, src0, src1, xr0, xr1, yr0, yr1, zr0, zr1

	/* x6/x7 = x3 + sign-extended table offset of each pixel */
	smov x6, \src0
	smov x7, \src1

	add x6, x6, x3
	add x7, x7, x3

	/* v16/v17 = 8 bytes at the cell; pointers then advance by x4 */
	ld1 {v16.2s}, [x6], x4
	ld1 {v17.2s}, [x7], x4

	/* v18/v19 = 8 bytes one y step on; pointers then advance by x5 */
	ld1 {v18.2s}, [x6], x5
	ld1 {v19.2s}, [x7], x5

	/* broadcast the y fraction of each pixel across a byte vector */
	dup v8.8b, \yr0
	dup v9.8b, \yr1
	/* Y interpolate, front, lanes 0 and 1 -> v12 and v13 */
	/* zip1 with the zero vector v5 places each byte in the high half
	* of a halfword, giving the a*256 term.
	*/
	zip1 v12.16b, v5.16b, v16.16b
	zip1 v13.16b, v5.16b, v17.16b
	umlsl v12.8h, v16.8b, v8.8b
	umlsl v13.8h, v17.8b, v9.8b
	umlal v12.8h, v18.8b, v8.8b
	umlal v13.8h, v19.8b, v9.8b

	/* pointers are now one y step and one z step on from the cell */
	ld1 {v18.2s}, [x6]
	ld1 {v19.2s}, [x7]

	/* step back by x4: one z step on from the cell, original y */
	sub x6, x6, x4
	sub x7, x7, x4

	ld1 {v16.2s}, [x6]
	ld1 {v17.2s}, [x7]

	/* Y interpolate, rear, lanes 0 and 1 -> v14 and v15 */
	zip1 v14.16b, v5.16b, v16.16b
	zip1 v15.16b, v5.16b, v17.16b
	umlsl v14.8h, v16.8b, v8.8b
	umlsl v15.8h, v17.8b, v9.8b
	umlal v14.8h, v18.8b, v8.8b
	umlal v15.8h, v19.8b, v9.8b

	/* Z interpolate, lane 0 v12/v14 -> v10 */
	/* widen to 32 bits with a shift of 8 (the a*256 term), blend
	* front with rear, then narrow back with rounding.
	*/
	ushll v8.4s, v12.4h, #8
	ushll2 v9.4s, v12.8h, #8
	umlsl v8.4s, v12.4h, \zr0
	umlsl2 v9.4s, v12.8h, \zr0
	umlal v8.4s, v14.4h, \zr0
	umlal2 v9.4s, v14.8h, \zr0
	rshrn v10.4h, v8.4s, #8
	rshrn2 v10.8h, v9.4s, #8

	/* Z interpolate, lane 1 v13/v15 -> v11 */
	ushll v8.4s, v13.4h, #8
	ushll2 v9.4s, v13.8h, #8
	umlsl v8.4s, v13.4h, \zr1
	umlsl2 v9.4s, v13.8h, \zr1
	umlal v8.4s, v15.4h, \zr1
	umlal2 v9.4s, v15.8h, \zr1
	rshrn v11.4h, v8.4s, #8
	rshrn2 v11.8h, v9.4s, #8

	/* X interpolate, lanes 0 and 1 v10,v11 -> v14 */
	/* the low four halfwords (first 4 loaded bytes) are blended with
	* the high four halfwords (next 4 loaded bytes); this narrowing
	* truncates rather than rounds.
	*/
	ushll v8.4s, v10.4h, #8
	ushll v9.4s, v11.4h, #8
	umlsl v8.4s, v10.4h, \xr0
	umlsl v9.4s, v11.4h, \xr1
	umlal2 v8.4s, v10.8h, \xr0
	umlal2 v9.4s, v11.8h, \xr1
	shrn v14.4h, v8.4s, #8
	shrn2 v14.8h, v9.4s, #8

	/* pack lanes 0-1 -> dst (saturating, rounding narrow to bytes);
	* the .16b destinations fill the upper half of v20/v21.
	*/
	.ifc \dst, v20.16b
	uqrshrn2 \dst, v14.8h, #8
	.else ; .ifc \dst, v21.16b
	uqrshrn2 \dst, v14.8h, #8
	.else
	uqrshrn \dst, v14.8h, #8
	.endif ; .endif
	.endm

	/* void rsdIntrinsic3DLUT_K(
	* void *dst, // x0
	* void const *in, // x1
	* size_t count, // x2
	* void const *lut, // x3
	* int32_t pitchy, // w4
	* int32_t pitchz, // w5
	* int dimx, // w6
	* int dimy, // w7
	* int dimz); // [sp]
	*
	* Processes count 4-byte pixels from in to dst, eight per iteration. The
	* first three bytes of each pixel are scaled by dimx, dimy and dimz to give
	* an integer table coordinate plus an 8-bit fraction; the surrounding table
	* entries are interpolated by the lanepair macro. The fourth byte of each
	* pixel is copied from the input unchanged.
	*
	* count is tested with signed comparisons; a value <= 0 returns without
	* touching dst. A final group of fewer than eight pixels is loaded and
	* stored lane by lane, so no access goes past count pixels of in or dst.
	*
	* Callee-saved d8-d15 are spilled to the stack; x4-x8 are used as scratch.
	*/
	ENTRY(rsdIntrinsic3DLUT_K)
	/* fetch dimz before sp moves */
	ldr w8, [sp]
	stp d8, d9, [sp, #-64]!
	stp d10, d11, [sp, #16]
	stp d12, d13, [sp, #32]
	stp d14, d15, [sp, #48]
	/* pitchy and pitchz are 32-bit arguments, so the upper halves of
	* x4 and x5 are unspecified on entry. lanepair uses the full
	* 64-bit registers for address arithmetic, so sign-extend first.
	*/
	sxtw x4, w4
	sxtw x5, w5
	/* v4.h[0..2] = dimx, dimy, dimz; v4.s[2] = pitchy; v4.s[3] = pitchz */
	movi v4.8b, #1
	ins v4.h[0], w6
	ins v4.h[1], w7
	ins v4.h[2], w8
	ins v4.s[2], w4
	ins v4.s[3], w5
	/* v5 = 0, required by lanepair */
	movi v5.16b, #0

	/* x2 = count - 8: >= 0 means a full group of eight is available */
	subs x2, x2, #8
	bge 2f
	cmn x2, #8 // same as cmp x2, #-8
	ble 9f /* count <= 0: nothing to do */
	b 4f /* 1..7 pixels: partial load */

	.align 6
	1: st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [x0], #32
	/* x0 = dst
	* x1 = src
	* x2 = count
	* x3 = lut
	* x4 = pitchy
	* x5 = pitchz
	* x6 = offset0
	* x7 = offset1
	*/
	2: ld4 {v0.8b-v3.8b}, [x1], #32
	/* v0,v1,v2,v3 source data
	* v4 dimensions and pitches
	*/
	3: uxtl v0.8h, v0.8b
	uxtl v1.8h, v1.8b
	uxtl v2.8h, v2.8b
	mul v0.8h, v0.8h, v4.h[0]
	mul v1.8h, v1.8h, v4.h[1]
	mul v2.8h, v2.8h, v4.h[2]

	/* ursra below would be more accurate, but this can result in a dim.0 case
	* where we try to read from the limit of the array and the limit +1 to
	* interpolate, even though the fractional component is zero. Strictly this is
	* correct, except for the illegal access problem.
	*/
	usra v0.8h, v0.8h, #8
	usra v1.8h, v1.8h, #8
	usra v2.8h, v2.8h, #8

	/* high byte = integer coordinate, low byte = fraction */
	ushr v12.8h, v0.8h, #8
	ushr v13.8h, v1.8h, #8
	ushr v14.8h, v2.8h, #8
	bic v0.8h, #0xff, LSL #8
	xtn v1.8b, v1.8h
	bic v2.8h, #0xff, LSL #8

	/* v0.8h,v1.8b,v2.8h fractional offset
	* v12.8h,v13.8h,v14.8h integer offset
	*/

	/* offset = x * 4 + y * pitchy + z * pitchz, as 32-bit lanes */
	ushll v6.4s, v12.4h, #2
	ushll2 v7.4s, v12.8h, #2
	uxtl v8.4s, v13.4h
	uxtl2 v9.4s, v13.8h
	uxtl v10.4s, v14.4h
	uxtl2 v11.4s, v14.8h
	mla v6.4s, v8.4s, v4.s[2]
	mla v7.4s, v9.4s, v4.s[2]
	mla v6.4s, v10.4s, v4.s[3]
	mla v7.4s, v11.4s, v4.s[3]

	/* v6,v7 list of table offsets */

	/* lanes 0 and 1 */
	lanepair dst=v20.8b, src0=v6.s[0], src1=v6.s[1], xr0=v0.h[0], xr1=v0.h[1], yr0=v1.b[0], yr1=v1.b[1], zr0=v2.h[0], zr1=v2.h[1]

	/* lanes 2 and 3 */
	lanepair dst=v20.16b, src0=v6.s[2], src1=v6.s[3], xr0=v0.h[2], xr1=v0.h[3], yr0=v1.b[2], yr1=v1.b[3], zr0=v2.h[2], zr1=v2.h[3]

	/* lanes 4 and 5 */
	lanepair dst=v21.8b, src0=v7.s[0], src1=v7.s[1], xr0=v0.h[4], xr1=v0.h[5], yr0=v1.b[4], yr1=v1.b[5], zr0=v2.h[4], zr1=v2.h[5]

	/* lanes 6 and 7 */
	lanepair dst=v21.16b, src0=v7.s[2], src1=v7.s[3], xr0=v0.h[6], xr1=v0.h[7], yr0=v1.b[6], yr1=v1.b[7], zr0=v2.h[6], zr1=v2.h[7]

	/* v20:v21 hold eight interleaved 4-byte results; de-interleave
	* them into one register per byte position for st4
	* (v20 = byte 0, v21 = byte 1, v22 = byte 2 of every pixel).
	*/
	uzp1 v6.16b, v20.16b, v21.16b
	uzp2 v7.16b, v20.16b, v21.16b
	uzp1 v20.16b, v6.16b, v7.16b
	uzp2 v22.16b, v6.16b, v7.16b
	mov v21.d[0], v20.d[1]

	subs x2, x2, #8
	/* fourth byte of each pixel passes through from the input */
	mov v23.8b, v3.8b

	bge 1b /* another full group: store and reload */

	cmn x2, #8 // same as cmp x2, #-8
	blt 1f /* this was the partial group: partial store */

	/* st4 leaves the flags from cmn intact for the beq below */
	st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [x0], #32
	beq 9f /* no pixels remain */

	/* fill the vector with a safe value */
	4: ld4r {v0.8b-v3.8b}, [x1]
	/* the low three bits of x2 equal the number of pixels left */
	tbz x2, #2, 2f
	ld4 {v0.b-v3.b}[0], [x1], #4
	ld4 {v0.b-v3.b}[1], [x1], #4
	ld4 {v0.b-v3.b}[2], [x1], #4
	ld4 {v0.b-v3.b}[3], [x1], #4
	2: tbz x2, #1, 2f
	ld4 {v0.b-v3.b}[4], [x1], #4
	ld4 {v0.b-v3.b}[5], [x1], #4
	2: tbz x2, #0, 2f
	ld4 {v0.b-v3.b}[6], [x1], #4
	2: b 3b

	/* partial store: same lane assignment as the partial load */
	1: tst x2, #4
	beq 2f
	st4 {v20.b-v23.b}[0], [x0], #4
	st4 {v20.b-v23.b}[1], [x0], #4
	st4 {v20.b-v23.b}[2], [x0], #4
	st4 {v20.b-v23.b}[3], [x0], #4
	2: tst x2, #2
	beq 2f
	st4 {v20.b-v23.b}[4], [x0], #4
	st4 {v20.b-v23.b}[5], [x0], #4
	2: tst x2, #1
	beq 9f
	st4 {v20.b-v23.b}[6], [x0], #4

	/* restore callee-saved d8-d15 and release the frame */
	9: ldp d14, d15, [sp, #48]
	ldp d12, d13, [sp, #32]
	ldp d10, d11, [sp, #16]
	ldp d8, d9, [sp], #64
	ret
	END(rsdIntrinsic3DLUT_K)