/* blob: 3fcfa2533ff992c8d953b8692061f6094b309ccd [file] [log] [blame] */
/*
* Copyright (C) 2014 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ENTRY(f): switch to .text, apply .align 4, export f as a global symbol of
 * type function and define the label f at the current location.
 * END(f): set the symbol size of f to the distance from f to the current
 * location. */
#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define END(f) .size f, .-f;
/* vmxx_f32 i, mask, opd, opa, opb
 * Conditionally emit one float multiply term, selected at assembly time:
 *   - mask bit clear in i       : emit nothing;
 *   - no bit below mask set in i: fmul (first term, overwrites opd);
 *   - otherwise                 : fmla (accumulate into opd).
 */
.macro vmxx_f32 i, mask, opd, opa, opb
  .ifne (\i) & \mask
    .ifeq (\i) & (\mask - 1)
        fmul            \opd, \opa, \opb
    .else
        fmla            \opd, \opa, \opb
    .endif
  .endif
.endm
/* vadd_f32 i, mask, opd, opa, opb, stupidsyntax1, stupidsyntax2
 * Conditionally emit the float add term, selected at assembly time:
 *   - mask bit clear in i       : emit nothing;
 *   - no bit below mask set in i: plain register copy
 *                                 (mov stupidsyntax1, stupidsyntax2);
 *   - otherwise                 : fadd opd, opa, opb.
 * The last two parameters carry the operands for the mov form, which is
 * written with a different arrangement than the fadd operands.
 */
.macro vadd_f32 i, mask, opd, opa, opb, stupidsyntax1, stupidsyntax2
  .ifne (\i) & \mask
    .ifeq (\i) & (\mask - 1)
        mov             \stupidsyntax1, \stupidsyntax2
    .else
        fadd            \opd, \opa, \opb
    .endif
  .endif
.endm
/* vmxx_s16 i, mask, opd, opa, opb
 * Conditionally emit one widening signed 16-bit multiply term (low half):
 *   - mask bit clear in i                  : emit nothing;
 *   - neither a lower bit nor bit 16 set   : smull (first term, overwrites opd);
 *   - otherwise                            : smlal (accumulate into opd).
 * Bit 16 counts as an earlier term because the caller pre-loads the
 * accumulator when that bit is set.
 */
.macro vmxx_s16 i, mask, opd, opa, opb
  .ifne (\i) & \mask
    .ifeq (\i) & (\mask - 1 + 16)
        smull           \opd, \opa, \opb
    .else
        smlal           \opd, \opa, \opb
    .endif
  .endif
.endm
/* vmxx2_s16 i, mask, opd, opa, opb
 * Upper-half counterpart of vmxx_s16:
 *   - mask bit clear in i                  : emit nothing;
 *   - neither a lower bit nor bit 16 set   : smull2 (first term, overwrites opd);
 *   - otherwise                            : smlal2 (accumulate into opd).
 */
.macro vmxx2_s16 i, mask, opd, opa, opb
  .ifne (\i) & \mask
    .ifeq (\i) & (\mask - 1 + 16)
        smull2          \opd, \opa, \opb
    .else
        smlal2          \opd, \opa, \opb
    .endif
  .endif
.endm
/* x0 = dst
* x1 = src
* x2 = count
* x3 = params
* x4 = column0_fn
* x5 = column1_fn
* x6 = column2_fn
* x7 = column3_fn
* x8 = store_fn
* x9 = load_fn
*/
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
/* Integer column 0 stage, emitted for every value i of the enclosing .irp.
 * The _<i> variant uses i as its term mask, the _n<i> variant uses i^31.
 * Mask bits 1/2/4/8 select the products of v12..v15 with coefficient lanes
 * v0.h[0], v0.h[4], v1.h[0], v1.h[4]; mask bit 16 pre-loads the 32-bit
 * accumulators v6 (low four lanes) and v7 (high four lanes) with v4.s[0].
 * The sums are shifted right by 8 and narrowed with unsigned saturation into
 * v8.8h, then control is passed on through x5. */
.align 6
colormatrix_int_col0_\i:
/* The .irp list stops at 15, so bit 16 is never set for this variant. */
.if \i & 16
dup v6.4s, v4.s[0]
dup v7.4s, v4.s[0]
.endif
/* Lanes 0..3 accumulate in v6. */
vmxx_s16 \i, 1, v6.4s, v12.4h, v0.h[0]
vmxx_s16 \i, 2, v6.4s, v13.4h, v0.h[4]
vmxx_s16 \i, 4, v6.4s, v14.4h, v1.h[0]
vmxx_s16 \i, 8, v6.4s, v15.4h, v1.h[4]
/* Lanes 4..7 accumulate in v7. */
vmxx2_s16 \i, 1, v7.4s, v12.8h, v0.h[0]
vmxx2_s16 \i, 2, v7.4s, v13.8h, v0.h[4]
vmxx2_s16 \i, 4, v7.4s, v14.8h, v1.h[0]
vmxx2_s16 \i, 8, v7.4s, v15.8h, v1.h[4]
sqshrun v8.4h, v6.4s, #8
sqshrun2 v8.8h, v7.4s, #8
br x5
/* Same stage with the complemented mask: i^31 always has bit 16 set, so the
 * accumulators start from the add term and every product uses smlal/smlal2. */
colormatrix_int_col0_n\i:
.if (\i^31) & 16
dup v6.4s, v4.s[0]
dup v7.4s, v4.s[0]
.endif
vmxx_s16 \i^31, 1, v6.4s, v12.4h, v0.h[0]
vmxx_s16 \i^31, 2, v6.4s, v13.4h, v0.h[4]
vmxx_s16 \i^31, 4, v6.4s, v14.4h, v1.h[0]
vmxx_s16 \i^31, 8, v6.4s, v15.4h, v1.h[4]
vmxx2_s16 \i^31, 1, v7.4s, v12.8h, v0.h[0]
vmxx2_s16 \i^31, 2, v7.4s, v13.8h, v0.h[4]
vmxx2_s16 \i^31, 4, v7.4s, v14.8h, v1.h[0]
vmxx2_s16 \i^31, 8, v7.4s, v15.8h, v1.h[4]
sqshrun v8.4h, v6.4s, #8
sqshrun2 v8.8h, v7.4s, #8
br x5
/* Integer column 1 stage, emitted for every value i of the enclosing .irp.
 * The _<i> variant uses i as its term mask, the _n<i> variant uses i^31.
 * Mask bits 1/2/4/8 select the products of v12..v15 with coefficient lanes
 * v0.h[1], v0.h[5], v1.h[1], v1.h[5]; mask bit 16 pre-loads the accumulators
 * v6/v7 with v4.s[1]. The sums are shifted right by 8 and narrowed with
 * unsigned saturation into v9.8h, then control is passed on through x6. */
.align 6
colormatrix_int_col1_\i:
/* The .irp list stops at 15, so bit 16 is never set for this variant. */
.if \i & 16
dup v6.4s, v4.s[1]
dup v7.4s, v4.s[1]
.endif
/* Lanes 0..3 accumulate in v6. */
vmxx_s16 \i, 1, v6.4s, v12.4h, v0.h[1]
vmxx_s16 \i, 2, v6.4s, v13.4h, v0.h[5]
vmxx_s16 \i, 4, v6.4s, v14.4h, v1.h[1]
vmxx_s16 \i, 8, v6.4s, v15.4h, v1.h[5]
/* Lanes 4..7 accumulate in v7. */
vmxx2_s16 \i, 1, v7.4s, v12.8h, v0.h[1]
vmxx2_s16 \i, 2, v7.4s, v13.8h, v0.h[5]
vmxx2_s16 \i, 4, v7.4s, v14.8h, v1.h[1]
vmxx2_s16 \i, 8, v7.4s, v15.8h, v1.h[5]
sqshrun v9.4h, v6.4s, #8
sqshrun2 v9.8h, v7.4s, #8
br x6
/* Complemented-mask variant: bit 16 is always set here. */
colormatrix_int_col1_n\i:
.if (\i^31) & 16
dup v6.4s, v4.s[1]
dup v7.4s, v4.s[1]
.endif
vmxx_s16 \i^31, 1, v6.4s, v12.4h, v0.h[1]
vmxx_s16 \i^31, 2, v6.4s, v13.4h, v0.h[5]
vmxx_s16 \i^31, 4, v6.4s, v14.4h, v1.h[1]
vmxx_s16 \i^31, 8, v6.4s, v15.4h, v1.h[5]
vmxx2_s16 \i^31, 1, v7.4s, v12.8h, v0.h[1]
vmxx2_s16 \i^31, 2, v7.4s, v13.8h, v0.h[5]
vmxx2_s16 \i^31, 4, v7.4s, v14.8h, v1.h[1]
vmxx2_s16 \i^31, 8, v7.4s, v15.8h, v1.h[5]
sqshrun v9.4h, v6.4s, #8
sqshrun2 v9.8h, v7.4s, #8
br x6
/* Integer column 2 stage, emitted for every value i of the enclosing .irp.
 * The _<i> variant uses i as its term mask, the _n<i> variant uses i^31.
 * Mask bits 1/2/4/8 select the products of v12..v15 with coefficient lanes
 * v0.h[2], v0.h[6], v1.h[2], v1.h[6]; mask bit 16 pre-loads the accumulators
 * v6/v7 with v4.s[2]. The sums are shifted right by 8 and narrowed with
 * unsigned saturation into v10.8h, then control is passed on through x7. */
.align 6
colormatrix_int_col2_\i:
/* The .irp list stops at 15, so bit 16 is never set for this variant. */
.if \i & 16
dup v6.4s, v4.s[2]
dup v7.4s, v4.s[2]
.endif
/* Lanes 0..3 accumulate in v6. */
vmxx_s16 \i, 1, v6.4s, v12.4h, v0.h[2]
vmxx_s16 \i, 2, v6.4s, v13.4h, v0.h[6]
vmxx_s16 \i, 4, v6.4s, v14.4h, v1.h[2]
vmxx_s16 \i, 8, v6.4s, v15.4h, v1.h[6]
/* Lanes 4..7 accumulate in v7. */
vmxx2_s16 \i, 1, v7.4s, v12.8h, v0.h[2]
vmxx2_s16 \i, 2, v7.4s, v13.8h, v0.h[6]
vmxx2_s16 \i, 4, v7.4s, v14.8h, v1.h[2]
vmxx2_s16 \i, 8, v7.4s, v15.8h, v1.h[6]
sqshrun v10.4h, v6.4s, #8
sqshrun2 v10.8h, v7.4s, #8
br x7
/* Complemented-mask variant: bit 16 is always set here. */
colormatrix_int_col2_n\i:
.if (\i^31) & 16
dup v6.4s, v4.s[2]
dup v7.4s, v4.s[2]
.endif
vmxx_s16 \i^31, 1, v6.4s, v12.4h, v0.h[2]
vmxx_s16 \i^31, 2, v6.4s, v13.4h, v0.h[6]
vmxx_s16 \i^31, 4, v6.4s, v14.4h, v1.h[2]
vmxx_s16 \i^31, 8, v6.4s, v15.4h, v1.h[6]
vmxx2_s16 \i^31, 1, v7.4s, v12.8h, v0.h[2]
vmxx2_s16 \i^31, 2, v7.4s, v13.8h, v0.h[6]
vmxx2_s16 \i^31, 4, v7.4s, v14.8h, v1.h[2]
vmxx2_s16 \i^31, 8, v7.4s, v15.8h, v1.h[6]
sqshrun v10.4h, v6.4s, #8
sqshrun2 v10.8h, v7.4s, #8
br x7
/* Integer column 3 stage, emitted for every value i of the enclosing .irp.
 * The _<i> variant uses i as its term mask, the _n<i> variant uses i^31.
 * Mask bits 1/2/4/8 select the products of v12..v15 with coefficient lanes
 * v0.h[3], v0.h[7], v1.h[3], v1.h[7]; mask bit 16 pre-loads the accumulators
 * v6/v7 with v4.s[3]. The sums are shifted right by 8 and narrowed with
 * unsigned saturation into v11.8h, then control is passed on through x8. */
.align 6
colormatrix_int_col3_\i:
/* The .irp list stops at 15, so bit 16 is never set for this variant. */
.if \i & 16
dup v6.4s, v4.s[3]
dup v7.4s, v4.s[3]
.endif
/* Lanes 0..3 accumulate in v6. */
vmxx_s16 \i, 1, v6.4s, v12.4h, v0.h[3]
vmxx_s16 \i, 2, v6.4s, v13.4h, v0.h[7]
vmxx_s16 \i, 4, v6.4s, v14.4h, v1.h[3]
vmxx_s16 \i, 8, v6.4s, v15.4h, v1.h[7]
/* Lanes 4..7 accumulate in v7. */
vmxx2_s16 \i, 1, v7.4s, v12.8h, v0.h[3]
vmxx2_s16 \i, 2, v7.4s, v13.8h, v0.h[7]
vmxx2_s16 \i, 4, v7.4s, v14.8h, v1.h[3]
vmxx2_s16 \i, 8, v7.4s, v15.8h, v1.h[7]
sqshrun v11.4h, v6.4s, #8
sqshrun2 v11.8h, v7.4s, #8
br x8
/* Complemented-mask variant: bit 16 is always set here. */
colormatrix_int_col3_n\i:
.if (\i^31) & 16
dup v6.4s, v4.s[3]
dup v7.4s, v4.s[3]
.endif
vmxx_s16 \i^31, 1, v6.4s, v12.4h, v0.h[3]
vmxx_s16 \i^31, 2, v6.4s, v13.4h, v0.h[7]
vmxx_s16 \i^31, 4, v6.4s, v14.4h, v1.h[3]
vmxx_s16 \i^31, 8, v6.4s, v15.4h, v1.h[7]
vmxx2_s16 \i^31, 1, v7.4s, v12.8h, v0.h[3]
vmxx2_s16 \i^31, 2, v7.4s, v13.8h, v0.h[7]
vmxx2_s16 \i^31, 4, v7.4s, v14.8h, v1.h[3]
vmxx2_s16 \i^31, 8, v7.4s, v15.8h, v1.h[7]
sqshrun v11.4h, v6.4s, #8
sqshrun2 v11.8h, v7.4s, #8
br x8
/* Float column 0 stage, emitted for every value i of the enclosing .irp.
 * The _<i> variant uses i as its term mask, the _n<i> variant uses i^31.
 * Two 4-lane results are produced: v8 from inputs v12..v15 and v16 from
 * inputs v20..v23. Mask bits 1/2/4/8 select the multipliers v0.s[0],
 * v1.s[0], v2.s[0], v3.s[0]; mask bit 16 adds v4 (or copies v4 when no
 * product was emitted). Control is then passed on through x5. */
.align 5
colormatrix_float_col0_\i:
/* First group of four lanes. */
vmxx_f32 \i, 1, v8.4s, v12.4s, v0.s[0]
vmxx_f32 \i, 2, v8.4s, v13.4s, v1.s[0]
vmxx_f32 \i, 4, v8.4s, v14.4s, v2.s[0]
vmxx_f32 \i, 8, v8.4s, v15.4s, v3.s[0]
vadd_f32 \i, 16, v8.4s, v8.4s, v4.4s, v8.16b, v4.16b
/* Second group of four lanes. */
vmxx_f32 \i, 1, v16.4s, v20.4s, v0.s[0]
vmxx_f32 \i, 2, v16.4s, v21.4s, v1.s[0]
vmxx_f32 \i, 4, v16.4s, v22.4s, v2.s[0]
vmxx_f32 \i, 8, v16.4s, v23.4s, v3.s[0]
vadd_f32 \i, 16, v16.4s, v16.4s, v4.4s, v16.16b, v4.16b
br x5
.align 4
/* Complemented-mask variant (mask i^31, so the add term is always present). */
colormatrix_float_col0_n\i:
vmxx_f32 \i^31, 1, v8.4s, v12.4s, v0.s[0]
vmxx_f32 \i^31, 2, v8.4s, v13.4s, v1.s[0]
vmxx_f32 \i^31, 4, v8.4s, v14.4s, v2.s[0]
vmxx_f32 \i^31, 8, v8.4s, v15.4s, v3.s[0]
vadd_f32 \i^31, 16, v8.4s, v8.4s, v4.4s, v8.16b, v4.16b
vmxx_f32 \i^31, 1, v16.4s, v20.4s, v0.s[0]
vmxx_f32 \i^31, 2, v16.4s, v21.4s, v1.s[0]
vmxx_f32 \i^31, 4, v16.4s, v22.4s, v2.s[0]
vmxx_f32 \i^31, 8, v16.4s, v23.4s, v3.s[0]
vadd_f32 \i^31, 16, v16.4s, v16.4s, v4.4s, v16.16b, v4.16b
br x5
/* Float column 1 stage, emitted for every value i of the enclosing .irp.
 * The _<i> variant uses i as its term mask, the _n<i> variant uses i^31.
 * Results: v9 from inputs v12..v15 and v17 from inputs v20..v23. Mask bits
 * 1/2/4/8 select the multipliers v0.s[1], v1.s[1], v2.s[1], v3.s[1]; mask
 * bit 16 adds v5 (or copies v5 when no product was emitted). Control is then
 * passed on through x6. */
.align 5
colormatrix_float_col1_\i:
/* First group of four lanes. */
vmxx_f32 \i, 1, v9.4s, v12.4s, v0.s[1]
vmxx_f32 \i, 2, v9.4s, v13.4s, v1.s[1]
vmxx_f32 \i, 4, v9.4s, v14.4s, v2.s[1]
vmxx_f32 \i, 8, v9.4s, v15.4s, v3.s[1]
vadd_f32 \i, 16, v9.4s, v9.4s, v5.4s, v9.16b, v5.16b
/* Second group of four lanes. */
vmxx_f32 \i, 1, v17.4s, v20.4s, v0.s[1]
vmxx_f32 \i, 2, v17.4s, v21.4s, v1.s[1]
vmxx_f32 \i, 4, v17.4s, v22.4s, v2.s[1]
vmxx_f32 \i, 8, v17.4s, v23.4s, v3.s[1]
vadd_f32 \i, 16, v17.4s, v17.4s, v5.4s, v17.16b, v5.16b
br x6
.align 4
/* Complemented-mask variant (mask i^31, so the add term is always present). */
colormatrix_float_col1_n\i:
vmxx_f32 \i^31, 1, v9.4s, v12.4s, v0.s[1]
vmxx_f32 \i^31, 2, v9.4s, v13.4s, v1.s[1]
vmxx_f32 \i^31, 4, v9.4s, v14.4s, v2.s[1]
vmxx_f32 \i^31, 8, v9.4s, v15.4s, v3.s[1]
vadd_f32 \i^31, 16, v9.4s, v9.4s, v5.4s, v9.16b, v5.16b
vmxx_f32 \i^31, 1, v17.4s, v20.4s, v0.s[1]
vmxx_f32 \i^31, 2, v17.4s, v21.4s, v1.s[1]
vmxx_f32 \i^31, 4, v17.4s, v22.4s, v2.s[1]
vmxx_f32 \i^31, 8, v17.4s, v23.4s, v3.s[1]
vadd_f32 \i^31, 16, v17.4s, v17.4s, v5.4s, v17.16b, v5.16b
br x6
/* Float column 2 stage, emitted for every value i of the enclosing .irp.
 * The _<i> variant uses i as its term mask, the _n<i> variant uses i^31.
 * Results: v10 from inputs v12..v15 and v18 from inputs v20..v23. Mask bits
 * 1/2/4/8 select the multipliers v0.s[2], v1.s[2], v2.s[2], v3.s[2]; mask
 * bit 16 adds v6 (or copies v6 when no product was emitted). Control is then
 * passed on through x7. */
.align 5
colormatrix_float_col2_\i:
/* First group of four lanes. */
vmxx_f32 \i, 1, v10.4s, v12.4s, v0.s[2]
vmxx_f32 \i, 2, v10.4s, v13.4s, v1.s[2]
vmxx_f32 \i, 4, v10.4s, v14.4s, v2.s[2]
vmxx_f32 \i, 8, v10.4s, v15.4s, v3.s[2]
vadd_f32 \i, 16, v10.4s, v10.4s, v6.4s, v10.16b, v6.16b
/* Second group of four lanes. */
vmxx_f32 \i, 1, v18.4s, v20.4s, v0.s[2]
vmxx_f32 \i, 2, v18.4s, v21.4s, v1.s[2]
vmxx_f32 \i, 4, v18.4s, v22.4s, v2.s[2]
vmxx_f32 \i, 8, v18.4s, v23.4s, v3.s[2]
vadd_f32 \i, 16, v18.4s, v18.4s, v6.4s, v18.16b, v6.16b
br x7
.align 4
/* Complemented-mask variant (mask i^31, so the add term is always present). */
colormatrix_float_col2_n\i:
vmxx_f32 \i^31, 1, v10.4s, v12.4s, v0.s[2]
vmxx_f32 \i^31, 2, v10.4s, v13.4s, v1.s[2]
vmxx_f32 \i^31, 4, v10.4s, v14.4s, v2.s[2]
vmxx_f32 \i^31, 8, v10.4s, v15.4s, v3.s[2]
vadd_f32 \i^31, 16, v10.4s, v10.4s, v6.4s, v10.16b, v6.16b
vmxx_f32 \i^31, 1, v18.4s, v20.4s, v0.s[2]
vmxx_f32 \i^31, 2, v18.4s, v21.4s, v1.s[2]
vmxx_f32 \i^31, 4, v18.4s, v22.4s, v2.s[2]
vmxx_f32 \i^31, 8, v18.4s, v23.4s, v3.s[2]
vadd_f32 \i^31, 16, v18.4s, v18.4s, v6.4s, v18.16b, v6.16b
br x7
/* Float column 3 stage, emitted for every value i of the enclosing .irp.
 * The _<i> variant uses i as its term mask, the _n<i> variant uses i^31.
 * Results: v11 from inputs v12..v15 and v19 from inputs v20..v23. Mask bits
 * 1/2/4/8 select the multipliers v0.s[3], v1.s[3], v2.s[3], v3.s[3]; mask
 * bit 16 adds v7 (or copies v7 when no product was emitted). Control is then
 * passed on through x8. */
.align 5
colormatrix_float_col3_\i:
/* First group of four lanes. */
vmxx_f32 \i, 1, v11.4s, v12.4s, v0.s[3]
vmxx_f32 \i, 2, v11.4s, v13.4s, v1.s[3]
vmxx_f32 \i, 4, v11.4s, v14.4s, v2.s[3]
vmxx_f32 \i, 8, v11.4s, v15.4s, v3.s[3]
vadd_f32 \i, 16, v11.4s, v11.4s, v7.4s, v11.16b, v7.16b
/* Second group of four lanes. */
vmxx_f32 \i, 1, v19.4s, v20.4s, v0.s[3]
vmxx_f32 \i, 2, v19.4s, v21.4s, v1.s[3]
vmxx_f32 \i, 4, v19.4s, v22.4s, v2.s[3]
vmxx_f32 \i, 8, v19.4s, v23.4s, v3.s[3]
vadd_f32 \i, 16, v19.4s, v19.4s, v7.4s, v19.16b, v7.16b
br x8
.align 4
/* Complemented-mask variant (mask i^31, so the add term is always present). */
colormatrix_float_col3_n\i:
vmxx_f32 \i^31, 1, v11.4s, v12.4s, v0.s[3]
vmxx_f32 \i^31, 2, v11.4s, v13.4s, v1.s[3]
vmxx_f32 \i^31, 4, v11.4s, v14.4s, v2.s[3]
vmxx_f32 \i^31, 8, v11.4s, v15.4s, v3.s[3]
vadd_f32 \i^31, 16, v11.4s, v11.4s, v7.4s, v11.16b, v7.16b
vmxx_f32 \i^31, 1, v19.4s, v20.4s, v0.s[3]
vmxx_f32 \i^31, 2, v19.4s, v21.4s, v1.s[3]
vmxx_f32 \i^31, 4, v19.4s, v22.4s, v2.s[3]
vmxx_f32 \i^31, 8, v19.4s, v23.4s, v3.s[3]
vadd_f32 \i^31, 16, v19.4s, v19.4s, v7.4s, v19.16b, v7.16b
br x8
.endr
/* Float path, full-block load of 8 groups of 4 interleaved bytes.
 * ld4 de-interleaves 32 bytes from [x1] (post-incremented) into four byte
 * vectors. Each is zero-extended 8 -> 16 -> 32 bits and converted to float:
 * elements 0..3 end up in v12..v15, elements 4..7 in v20..v23.
 * Continues at the stage held in x4. */
.align 6
colormatrix_float_ldu4:
ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [x1], #32
uxtl v20.8h, v20.8b
uxtl v21.8h, v21.8b
uxtl v22.8h, v22.8b
uxtl v23.8h, v23.8b
/* Low halves go to v12..v15 before the high halves overwrite v20..v23. */
uxtl v12.4s, v20.4h
uxtl v13.4s, v21.4h
uxtl v14.4s, v22.4h
uxtl v15.4s, v23.4h
uxtl2 v20.4s, v20.8h
uxtl2 v21.4s, v21.8h
uxtl2 v22.4s, v22.8h
uxtl2 v23.4s, v23.8h
ucvtf v12.4s, v12.4s
ucvtf v13.4s, v13.4s
ucvtf v14.4s, v14.4s
ucvtf v15.4s, v15.4s
ucvtf v20.4s, v20.4s
ucvtf v21.4s, v21.4s
ucvtf v22.4s, v22.4s
ucvtf v23.4s, v23.4s
br x4
/* Integer path, full-block load of 8 groups of 4 interleaved bytes.
 * ld4 de-interleaves 32 bytes from [x1] (post-incremented) into v12..v15,
 * which are then zero-extended to eight 16-bit lanes each.
 * Continues at the stage held in x4. */
.align 5
colormatrix_int_ldu4:
ld4 {v12.8b,v13.8b,v14.8b,v15.8b}, [x1], #32
uxtl v12.8h, v12.8b
uxtl v13.8h, v13.8b
uxtl v14.8h, v14.8b
uxtl v15.8h, v15.8b
br x4
/* Float path, full-block load for the 3-channel case.
 * The memory access is the same 32-byte ld4 as the 4-channel load, but only
 * the first three de-interleaved vectors are widened and converted: elements
 * 0..3 land in v12..v14, elements 4..7 in v20..v22. v23 keeps the raw loaded
 * bytes and v15 is not written. Continues at the stage held in x4. */
.align 6
colormatrix_float_ldu3:
ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [x1], #32
uxtl v20.8h, v20.8b
uxtl v21.8h, v21.8b
uxtl v22.8h, v22.8b
uxtl v12.4s, v20.4h
uxtl v13.4s, v21.4h
uxtl v14.4s, v22.4h
uxtl2 v20.4s, v20.8h
uxtl2 v21.4s, v21.8h
uxtl2 v22.4s, v22.8h
ucvtf v12.4s, v12.4s
ucvtf v13.4s, v13.4s
ucvtf v14.4s, v14.4s
ucvtf v20.4s, v20.4s
ucvtf v21.4s, v21.4s
ucvtf v22.4s, v22.4s
br x4
/* Integer path, full-block load for the 3-channel case.
 * Loads 32 bytes with ld4 into v12..v15 but widens only v12..v14 to 16-bit
 * lanes; v15 keeps the raw loaded bytes. Continues at the stage held in x4. */
colormatrix_int_ldu3:
ld4 {v12.8b,v13.8b,v14.8b,v15.8b}, [x1], #32
uxtl v12.8h, v12.8b
uxtl v13.8h, v13.8b
uxtl v14.8h, v14.8b
br x4
/* Float path, full-block load of 8 single bytes.
 * Loads 8 bytes from [x1] (post-incremented), zero-extends them to 32 bits
 * and converts to float: elements 0..3 in v12, elements 4..7 in v20.
 * Continues at the stage held in x4. */
.align 5
colormatrix_float_ldu1:
ld1 {v20.8b}, [x1], #8
uxtl v20.8h, v20.8b
uxtl v12.4s, v20.4h
uxtl2 v20.4s, v20.8h
ucvtf v12.4s, v12.4s
ucvtf v20.4s, v20.4s
br x4
/* Float path, full-block load of 8 groups of 2 interleaved bytes.
 * ld2 de-interleaves 16 bytes from [x1] (post-incremented); both vectors are
 * zero-extended to 32 bits and converted to float: elements 0..3 in v12/v13,
 * elements 4..7 in v20/v21. Continues at the stage held in x4. */
.align 6
colormatrix_float_ldu2:
ld2 {v20.8b,v21.8b}, [x1], #16
uxtl v20.8h, v20.8b
uxtl v21.8h, v21.8b
uxtl v12.4s, v20.4h
uxtl v13.4s, v21.4h
uxtl2 v20.4s, v20.8h
uxtl2 v21.4s, v21.8h
ucvtf v12.4s, v12.4s
ucvtf v13.4s, v13.4s
ucvtf v20.4s, v20.4s
ucvtf v21.4s, v21.4s
br x4
/* Integer path, full-block load of 8 groups of 2 interleaved bytes.
 * ld2 de-interleaves 16 bytes from [x1] (post-incremented) into v12/v13,
 * which are zero-extended to 16-bit lanes. Continues at the stage in x4. */
.align 4
colormatrix_int_ldu2:
ld2 {v12.8b,v13.8b}, [x1], #16
uxtl v12.8h, v12.8b
uxtl v13.8h, v13.8b
br x4
/* Float path, full-block store of 8 groups of 4 interleaved bytes.
 * v8..v11 (elements 0..3) and v16..v19 (elements 4..7) are converted to
 * signed fixed point with one fractional bit, then rounded and narrowed to
 * 16 bits with unsigned saturation, then narrowed to bytes with unsigned
 * saturation and stored interleaved to [x0] (post-incremented by 32).
 * x2 is decremented by 8 first; on borrow the routine leaves the main loop
 * through colormatrix_float_end, otherwise it continues at x9. */
.align 6
colormatrix_float_stu4:
fcvtzs v24.4s, v8.4s, #1
fcvtzs v25.4s, v9.4s, #1
fcvtzs v26.4s, v10.4s, #1
fcvtzs v27.4s, v11.4s, #1
fcvtzs v28.4s, v16.4s, #1
fcvtzs v29.4s, v17.4s, #1
fcvtzs v30.4s, v18.4s, #1
fcvtzs v31.4s, v19.4s, #1
sqrshrun v24.4h, v24.4s, #1
sqrshrun v25.4h, v25.4s, #1
sqrshrun v26.4h, v26.4s, #1
sqrshrun v27.4h, v27.4s, #1
sqrshrun2 v24.8h, v28.4s, #1
sqrshrun2 v25.8h, v29.4s, #1
sqrshrun2 v26.8h, v30.4s, #1
sqrshrun2 v27.8h, v31.4s, #1
uqxtn v24.8b, v24.8h
uqxtn v25.8b, v25.8h
uqxtn v26.8b, v26.8h
uqxtn v27.8b, v27.8h
/* The flags set here are consumed by blo below; st4 does not change them. */
subs x2, x2, #8
st4 {v24.8b,v25.8b,v26.8b,v27.8b}, [x0], #32
blo colormatrix_float_end
br x9
/* Integer path, full-block store of 8 groups of 4 interleaved bytes.
 * v8..v11 (16-bit lanes) are narrowed to bytes with unsigned saturation and
 * stored interleaved to [x0] (post-incremented by 32). x2 is decremented by
 * 8 first; on borrow the routine leaves the main loop through
 * colormatrix_int_end, otherwise it continues at x9. */
.align 5
colormatrix_int_stu4:
uqxtn v12.8b, v8.8h
uqxtn v13.8b, v9.8h
uqxtn v14.8b, v10.8h
uqxtn v15.8b, v11.8h
/* The flags set here are consumed by blo below; st4 does not change them. */
subs x2, x2, #8
st4 {v12.8b,v13.8b,v14.8b,v15.8b}, [x0], #32
blo colormatrix_int_end
br x9
/* Float path, full-block store for the 3-channel case.
 * Converts and narrows v8..v10 / v16..v18 to bytes the same way as the
 * 4-channel store, writes zero as the fourth byte of every group and stores
 * 32 interleaved bytes to [x0]. x2 is decremented by 8; on borrow control
 * goes to colormatrix_float_end, otherwise to x9. */
.align 6
colormatrix_float_stu3:
fcvtzs v24.4s, v8.4s, #1
fcvtzs v25.4s, v9.4s, #1
fcvtzs v26.4s, v10.4s, #1
fcvtzs v28.4s, v16.4s, #1
fcvtzs v29.4s, v17.4s, #1
fcvtzs v30.4s, v18.4s, #1
sqrshrun v24.4h, v24.4s, #1
sqrshrun v25.4h, v25.4s, #1
sqrshrun v26.4h, v26.4s, #1
sqrshrun2 v24.8h, v28.4s, #1
sqrshrun2 v25.8h, v29.4s, #1
sqrshrun2 v26.8h, v30.4s, #1
uqxtn v24.8b, v24.8h
uqxtn v25.8b, v25.8h
uqxtn v26.8b, v26.8h
/* Fourth byte of each group is stored as zero. */
movi v27.8b, #0
subs x2, x2, #8
st4 {v24.8b,v25.8b,v26.8b,v27.8b}, [x0], #32
blo colormatrix_float_end
br x9
/* Integer path, full-block load of 8 single bytes.
 * Loads 8 bytes from [x1] (post-incremented) and zero-extends them to the
 * eight 16-bit lanes of v12. Continues at the stage held in x4. */
.align 4
colormatrix_int_ldu1:
ld1 {v12.8b}, [x1], #8
uxtl v12.8h, v12.8b
br x4
/* Integer path, full-block store for the 3-channel case.
 * Narrows v8..v10 to bytes with unsigned saturation, writes zero as the
 * fourth byte of every group and stores 32 interleaved bytes to [x0].
 * x2 is decremented by 8; on borrow control goes to colormatrix_int_end,
 * otherwise to x9. */
.align 5
colormatrix_int_stu3:
uqxtn v12.8b, v8.8h
uqxtn v13.8b, v9.8h
uqxtn v14.8b, v10.8h
/* Fourth byte of each group is stored as zero. */
movi v15.8b, #0
subs x2, x2, #8
st4 {v12.8b,v13.8b,v14.8b,v15.8b}, [x0], #32
blo colormatrix_int_end
br x9
/* Float path, full-block store of 8 groups of 2 interleaved bytes.
 * v8/v9 (elements 0..3) and v16/v17 (elements 4..7) are converted to fixed
 * point with one fractional bit, rounded and narrowed to 16 bits, narrowed to
 * bytes (both with unsigned saturation) and stored interleaved to [x0]
 * (post-incremented by 16). x2 is decremented by 8; on borrow control goes
 * to colormatrix_float_end, otherwise to x9. */
.align 6
colormatrix_float_stu2:
fcvtzs v24.4s, v8.4s, #1
fcvtzs v25.4s, v9.4s, #1
fcvtzs v28.4s, v16.4s, #1
fcvtzs v29.4s, v17.4s, #1
sqrshrun v24.4h, v24.4s, #1
sqrshrun v25.4h, v25.4s, #1
sqrshrun2 v24.8h, v28.4s, #1
sqrshrun2 v25.8h, v29.4s, #1
uqxtn v24.8b, v24.8h
uqxtn v25.8b, v25.8h
subs x2, x2, #8
st2 {v24.8b,v25.8b}, [x0], #16
blo colormatrix_float_end
br x9
/* Integer path, full-block store of 8 groups of 2 interleaved bytes.
 * Narrows v8/v9 to bytes with unsigned saturation and stores them
 * interleaved to [x0] (post-incremented by 16). x2 is decremented by 8; on
 * borrow control goes to colormatrix_int_end, otherwise to x9. */
.align 5
colormatrix_int_stu2:
uqxtn v12.8b, v8.8h
uqxtn v13.8b, v9.8h
subs x2, x2, #8
st2 {v12.8b,v13.8b}, [x0], #16
blo colormatrix_int_end
br x9
/* Integer path, full-block store of 8 single bytes.
 * Narrows v8 to bytes with unsigned saturation and stores 8 bytes to [x0]
 * (post-incremented). x2 is decremented by 8; on borrow control goes to
 * colormatrix_int_end, otherwise to x9. */
.align 5
colormatrix_int_stu1:
uqxtn v12.8b, v8.8h
subs x2, x2, #8
st1 {v12.8b}, [x0], #8
blo colormatrix_int_end
br x9
/* Float path, full-block load of 8 groups of 4 interleaved 32-bit values
 * (same instructions as colormatrix_float_ldf4): elements 0..3 are
 * de-interleaved into v12..v15 and elements 4..7 into v20..v23, reading 128
 * bytes from [x1]. Continues at the stage held in x4. */
colormatrix_float_ldf3:
ld4 {v12.4s,v13.4s,v14.4s,v15.4s}, [x1], #64
ld4 {v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64
br x4
/* Float path, full-block store of 8 single bytes.
 * v8 (elements 0..3) and v16 (elements 4..7) are converted to fixed point
 * with one fractional bit, rounded and narrowed to 16 bits, narrowed to bytes
 * (both with unsigned saturation) and stored to [x0] (post-incremented by 8).
 * x2 is decremented by 8; on borrow control goes to colormatrix_float_end,
 * otherwise to x9. */
.align 6
colormatrix_float_stu1:
fcvtzs v24.4s, v8.4s, #1
fcvtzs v28.4s, v16.4s, #1
sqrshrun v24.4h, v24.4s, #1
sqrshrun2 v24.8h, v28.4s, #1
uqxtn v24.8b, v24.8h
subs x2, x2, #8
st1 {v24.8b}, [x0], #8
blo colormatrix_float_end
br x9
/* Float path, full-block store of 8 groups of 4 interleaved 32-bit values
 * for the 3-channel case: v11 and v19 are cleared so the fourth value of
 * every group is stored as zero bits. Writes 128 bytes to [x0]. x2 is
 * decremented by 8; on borrow control goes to colormatrix_float_end,
 * otherwise to x9. */
colormatrix_float_stf3:
movi v11.16b, #0
st4 {v8.4s,v9.4s,v10.4s,v11.4s}, [x0], #64
movi v19.16b, #0
subs x2, x2, #8
st4 {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
blo colormatrix_float_end
br x9
/* Float path, full-block store of 8 groups of 4 interleaved 32-bit values:
 * v8..v11 (elements 0..3) then v16..v19 (elements 4..7), 128 bytes to [x0].
 * x2 is decremented by 8; on borrow control goes to colormatrix_float_end,
 * otherwise to x9. */
.align 5
colormatrix_float_stf4:
st4 {v8.4s,v9.4s,v10.4s,v11.4s}, [x0], #64
subs x2, x2, #8
st4 {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
blo colormatrix_float_end
br x9
/* Float path, full-block load of 8 groups of 4 interleaved 32-bit values:
 * elements 0..3 are de-interleaved into v12..v15 and elements 4..7 into
 * v20..v23, reading 128 bytes from [x1]. Continues at the stage in x4. */
colormatrix_float_ldf4:
ld4 {v12.4s,v13.4s,v14.4s,v15.4s}, [x1], #64
ld4 {v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64
br x4
/* Float path, full-block store of 8 groups of 2 interleaved 32-bit values:
 * v8/v9 (elements 0..3) then v16/v17 (elements 4..7), 64 bytes to [x0].
 * x2 is decremented by 8; on borrow control goes to colormatrix_float_end,
 * otherwise to x9. */
.align 5
colormatrix_float_stf2:
st2 {v8.4s, v9.4s}, [x0], #32
subs x2, x2, #8
st2 {v16.4s, v17.4s}, [x0], #32
blo colormatrix_float_end
br x9
/* Float path, full-block load of 8 groups of 2 interleaved 32-bit values:
 * elements 0..3 into v12/v13 and elements 4..7 into v20/v21, reading 64
 * bytes from [x1]. Continues at the stage held in x4. */
colormatrix_float_ldf2:
ld2 {v12.4s,v13.4s}, [x1], #32
ld2 {v20.4s,v21.4s}, [x1], #32
br x4
/* Float path, full-block store of 8 single 32-bit values: v8 (elements 0..3)
 * then v16 (elements 4..7), 32 bytes to [x0]. x2 is decremented by 8; on
 * borrow control goes to colormatrix_float_end, otherwise to x9. */
.align 5
colormatrix_float_stf1:
st1 {v8.4s}, [x0], #16
subs x2, x2, #8
st1 {v16.4s}, [x0], #16
blo colormatrix_float_end
br x9
/* Float path, full-block load of 8 single 32-bit values: elements 0..3 into
 * v12 and elements 4..7 into v20, reading 32 bytes from [x1].
 * Continues at the stage held in x4. */
colormatrix_float_ldf1:
ld1 {v12.4s}, [x1], #16
ld1 {v20.4s}, [x1], #16
br x4
/* Integer path, partial-block store of single bytes.
 * Narrows v8 to bytes, then uses bits 2, 1 and 0 of x2 to store 4, 2 and 1
 * elements from byte lanes 4..7, 2..3 and 1 respectively. Finishes by
 * branching to colormatrix_int_realend. */
colormatrix_int_stu1_end:
uqxtn v12.8b, v8.8h
tbz x2, #2, 1f
st1 {v12.s}[1], [x0], #4
1: tbz x2, #1, 1f
st1 {v12.h}[1], [x0], #2
1: tbz x2, #0, 1f
st1 {v12.b}[1], [x0], #1
1: b colormatrix_int_realend
/* Integer path, partial-block store of 2-byte groups.
 * Narrows v8/v9 to bytes and re-interleaves them with zip1 so that element k
 * occupies bytes 2k and 2k+1 of v12. Bits 2, 1 and 0 of x2 then store 4, 2
 * and 1 elements from elements 4..7, 2..3 and 1 respectively. Finishes by
 * branching to colormatrix_int_realend. */
colormatrix_int_stu2_end:
uqxtn v12.8b, v8.8h
uqxtn v13.8b, v9.8h
zip1 v12.16b, v12.16b, v13.16b
tbz x2, #2, 1f
st1 {v12.d}[1], [x0], #8
1: tbz x2, #1, 1f
st1 {v12.s}[1], [x0], #4
1: tbz x2, #0, 1f
st1 {v12.h}[1], [x0], #2
1: b colormatrix_int_realend
/* Integer path, partial-block store for the 3-channel case.
 * Narrows v8..v10 to bytes and uses zero for the fourth byte of each group.
 * Bits 2, 1 and 0 of x2 store 4, 2 and 1 four-byte groups taken from lanes
 * 4..7, 2..3 and 1 respectively, one lane per st4. Finishes by branching to
 * colormatrix_int_realend. */
colormatrix_int_stu3_end:
uqxtn v12.8b, v8.8h
uqxtn v13.8b, v9.8h
uqxtn v14.8b, v10.8h
movi v15.8b, #0
tbz x2, #2, 1f
st4 {v12.b,v13.b,v14.b,v15.b}[4], [x0], #4
st4 {v12.b,v13.b,v14.b,v15.b}[5], [x0], #4
st4 {v12.b,v13.b,v14.b,v15.b}[6], [x0], #4
st4 {v12.b,v13.b,v14.b,v15.b}[7], [x0], #4
1: tbz x2, #1, 1f
st4 {v12.b,v13.b,v14.b,v15.b}[2], [x0], #4
st4 {v12.b,v13.b,v14.b,v15.b}[3], [x0], #4
1: tbz x2, #0, 1f
st4 {v12.b,v13.b,v14.b,v15.b}[1], [x0], #4
1: b colormatrix_int_realend
/* Integer path, partial-block store of 4-byte groups.
 * Narrows v8..v11 to bytes. Bits 2, 1 and 0 of x2 store 4, 2 and 1 four-byte
 * groups taken from lanes 4..7, 2..3 and 1 respectively, one lane per st4.
 * Finishes by branching to colormatrix_int_realend. */
colormatrix_int_stu4_end:
uqxtn v12.8b, v8.8h
uqxtn v13.8b, v9.8h
uqxtn v14.8b, v10.8h
uqxtn v15.8b, v11.8h
tbz x2, #2, 1f
st4 {v12.b,v13.b,v14.b,v15.b}[4], [x0], #4
st4 {v12.b,v13.b,v14.b,v15.b}[5], [x0], #4
st4 {v12.b,v13.b,v14.b,v15.b}[6], [x0], #4
st4 {v12.b,v13.b,v14.b,v15.b}[7], [x0], #4
1: tbz x2, #1, 1f
st4 {v12.b,v13.b,v14.b,v15.b}[2], [x0], #4
st4 {v12.b,v13.b,v14.b,v15.b}[3], [x0], #4
1: tbz x2, #0, 1f
st4 {v12.b,v13.b,v14.b,v15.b}[1], [x0], #4
1: b colormatrix_int_realend
/* Integer path, partial-block load of single bytes.
 * Bits 2, 1 and 0 of x2 load 4, 2 and 1 bytes into bytes 12..15, 10..11 and
 * 9 of v15; uxtl2 then widens the upper eight bytes of v15 into v12.8h, so
 * the loaded data lands in lanes 4..7, 2..3 and 1. Lanes that were not
 * loaded keep whatever v15 held before. Continues at the stage in x4. */
colormatrix_int_ldu1_end:
tbz x2, #2, 1f
ld1 {v15.s}[3], [x1], #4
1: tbz x2, #1, 1f
ld1 {v15.h}[5], [x1], #2
1: tbz x2, #0, 1f
ld1 {v15.b}[9], [x1], #1
1: uxtl2 v12.8h, v15.16b
br x4
/* Integer path, partial-block load of 2-byte groups.
 * Bits 2, 1 and 0 of x2 load 4, 2 and 1 groups into elements 4..7, 2..3 and
 * 1 of v15 (viewed as 2-byte elements). uzp1/uzp2 then split even and odd
 * bytes into v14/v15, which are zero-extended into v12.8h and v13.8h.
 * Elements that were not loaded keep whatever v15 held before.
 * Continues at the stage held in x4. */
colormatrix_int_ldu2_end:
tbz x2, #2, 1f
ld1 {v15.d}[1], [x1], #8
1: tbz x2, #1, 1f
ld1 {v15.s}[1], [x1], #4
1: tbz x2, #0, 1f
ld1 {v15.h}[1], [x1], #2
1: uzp1 v14.16b, v15.16b, v15.16b
uzp2 v15.16b, v15.16b, v15.16b
uxtl v12.8h, v14.8b
uxtl v13.8h, v15.8b
br x4
/* Integer path, partial-block load for the 3-channel case.
 * Bits 2, 1 and 0 of x2 load 4, 2 and 1 four-byte groups into lanes 4..7,
 * 2..3 and 1 of v12..v15, one lane per ld4. Only v12..v14 are then widened
 * to 16-bit lanes. Lanes that were not loaded keep their previous contents.
 * Continues at the stage held in x4. */
colormatrix_int_ldu3_end:
tbz x2, #2, 1f
ld4 {v12.b,v13.b,v14.b,v15.b}[4], [x1], #4
ld4 {v12.b,v13.b,v14.b,v15.b}[5], [x1], #4
ld4 {v12.b,v13.b,v14.b,v15.b}[6], [x1], #4
ld4 {v12.b,v13.b,v14.b,v15.b}[7], [x1], #4
1: tbz x2, #1, 1f
ld4 {v12.b,v13.b,v14.b,v15.b}[2], [x1], #4
ld4 {v12.b,v13.b,v14.b,v15.b}[3], [x1], #4
1: tbz x2, #0, 1f
ld4 {v12.b,v13.b,v14.b,v15.b}[1], [x1], #4
1: uxtl v12.8h, v12.8b
uxtl v13.8h, v13.8b
uxtl v14.8h, v14.8b
br x4
/* Integer path, partial-block load of 4-byte groups.
 * Bits 2, 1 and 0 of x2 load 4, 2 and 1 four-byte groups into lanes 4..7,
 * 2..3 and 1 of v12..v15, one lane per ld4; all four vectors are then
 * widened to 16-bit lanes. Lanes that were not loaded keep their previous
 * contents. Continues at the stage held in x4. */
colormatrix_int_ldu4_end:
tbz x2, #2, 1f
ld4 {v12.b,v13.b,v14.b,v15.b}[4], [x1], #4
ld4 {v12.b,v13.b,v14.b,v15.b}[5], [x1], #4
ld4 {v12.b,v13.b,v14.b,v15.b}[6], [x1], #4
ld4 {v12.b,v13.b,v14.b,v15.b}[7], [x1], #4
1: tbz x2, #1, 1f
ld4 {v12.b,v13.b,v14.b,v15.b}[2], [x1], #4
ld4 {v12.b,v13.b,v14.b,v15.b}[3], [x1], #4
1: tbz x2, #0, 1f
ld4 {v12.b,v13.b,v14.b,v15.b}[1], [x1], #4
1: uxtl v12.8h, v12.8b
uxtl v13.8h, v13.8b
uxtl v14.8h, v14.8b
uxtl v15.8h, v15.8b
br x4
/* Float path, partial-block store of single bytes.
 * Converts v8 (elements 0..3) and v16 (elements 4..7) to bytes the same way
 * as colormatrix_float_stu1, then uses bits 2, 1 and 0 of x2 to store 4, 2
 * and 1 elements from byte lanes 4..7, 2..3 and 1 respectively. Finishes by
 * branching to colormatrix_float_realend. */
colormatrix_float_stu1_end:
fcvtzs v12.4s, v8.4s, #1
fcvtzs v13.4s, v16.4s, #1
sqrshrun v12.4h, v12.4s, #1
sqrshrun2 v12.8h, v13.4s, #1
uqxtn v12.8b, v12.8h
tbz x2, #2, 1f
st1 {v12.s}[1], [x0], #4
1: tbz x2, #1, 1f
st1 {v12.h}[1], [x0], #2
1: tbz x2, #0, 1f
st1 {v12.b}[1], [x0], #1
1: b colormatrix_float_realend
/* Float path, partial-block store of 2-byte groups.
 * Converts v8/v9 (elements 0..3) and v16/v17 (elements 4..7) to 16-bit values
 * with unsigned saturation, interleaves the two channels with zip1, and
 * narrows to bytes so that element k occupies bytes 2k and 2k+1 of v12.
 * Bits 2, 1 and 0 of x2 then store 4, 2 and 1 elements from elements 4..7,
 * 2..3 and 1. Finishes by branching to colormatrix_float_realend. */
colormatrix_float_stu2_end:
fcvtzs v12.4s, v8.4s, #1
fcvtzs v13.4s, v9.4s, #1
fcvtzs v14.4s, v16.4s, #1
fcvtzs v15.4s, v17.4s, #1
sqrshrun v12.4h, v12.4s, #1
sqrshrun v13.4h, v13.4s, #1
sqrshrun v14.4h, v14.4s, #1
sqrshrun v15.4h, v15.4s, #1
/* Interleave the two channels: v12 = elements 0..3, v13 = elements 4..7. */
zip1 v12.8h, v12.8h, v13.8h
zip1 v13.8h, v14.8h, v15.8h
uqxtn v12.8b, v12.8h
uqxtn2 v12.16b, v13.8h
tbz x2, #2, 1f
st1 {v12.d}[1], [x0], #8
1: tbz x2, #1, 1f
st1 {v12.s}[1], [x0], #4
1: tbz x2, #0, 1f
st1 {v12.h}[1], [x0], #2
1: b colormatrix_float_realend
/* Float path, partial-block store for the 3-channel byte case.
 * Converts v8..v10 / v16..v18 to bytes the same way as
 * colormatrix_float_stu3 and uses zero for the fourth byte of each group.
 * Bits 2, 1 and 0 of x2 store 4, 2 and 1 four-byte groups taken from lanes
 * 4..7, 2..3 and 1, one lane per st4. Finishes by branching to
 * colormatrix_float_realend. */
colormatrix_float_stu3_end:
fcvtzs v24.4s, v8.4s, #1
fcvtzs v25.4s, v9.4s, #1
fcvtzs v26.4s, v10.4s, #1
fcvtzs v28.4s, v16.4s, #1
fcvtzs v29.4s, v17.4s, #1
fcvtzs v30.4s, v18.4s, #1
sqrshrun v24.4h, v24.4s, #1
sqrshrun v25.4h, v25.4s, #1
sqrshrun v26.4h, v26.4s, #1
sqrshrun2 v24.8h, v28.4s, #1
sqrshrun2 v25.8h, v29.4s, #1
sqrshrun2 v26.8h, v30.4s, #1
uqxtn v12.8b, v24.8h
uqxtn v13.8b, v25.8h
uqxtn v14.8b, v26.8h
movi v15.8b, #0
tbz x2, #2, 1f
st4 {v12.b,v13.b,v14.b,v15.b}[4], [x0], #4
st4 {v12.b,v13.b,v14.b,v15.b}[5], [x0], #4
st4 {v12.b,v13.b,v14.b,v15.b}[6], [x0], #4
st4 {v12.b,v13.b,v14.b,v15.b}[7], [x0], #4
1: tbz x2, #1, 1f
st4 {v12.b,v13.b,v14.b,v15.b}[2], [x0], #4
st4 {v12.b,v13.b,v14.b,v15.b}[3], [x0], #4
1: tbz x2, #0, 1f
st4 {v12.b,v13.b,v14.b,v15.b}[1], [x0], #4
1: b colormatrix_float_realend
/* Float path, partial-block store of 4-byte groups.
 * Converts v8..v11 / v16..v19 to bytes the same way as
 * colormatrix_float_stu4. Bits 2, 1 and 0 of x2 store 4, 2 and 1 four-byte
 * groups taken from lanes 4..7, 2..3 and 1, one lane per st4. Finishes by
 * branching to colormatrix_float_realend. */
colormatrix_float_stu4_end:
fcvtzs v24.4s, v8.4s, #1
fcvtzs v25.4s, v9.4s, #1
fcvtzs v26.4s, v10.4s, #1
fcvtzs v27.4s, v11.4s, #1
fcvtzs v28.4s, v16.4s, #1
fcvtzs v29.4s, v17.4s, #1
fcvtzs v30.4s, v18.4s, #1
fcvtzs v31.4s, v19.4s, #1
sqrshrun v24.4h, v24.4s, #1
sqrshrun v25.4h, v25.4s, #1
sqrshrun v26.4h, v26.4s, #1
sqrshrun v27.4h, v27.4s, #1
sqrshrun2 v24.8h, v28.4s, #1
sqrshrun2 v25.8h, v29.4s, #1
sqrshrun2 v26.8h, v30.4s, #1
sqrshrun2 v27.8h, v31.4s, #1
uqxtn v12.8b, v24.8h
uqxtn v13.8b, v25.8h
uqxtn v14.8b, v26.8h
uqxtn v15.8b, v27.8h
tbz x2, #2, 1f
st4 {v12.b,v13.b,v14.b,v15.b}[4], [x0], #4
st4 {v12.b,v13.b,v14.b,v15.b}[5], [x0], #4
st4 {v12.b,v13.b,v14.b,v15.b}[6], [x0], #4
st4 {v12.b,v13.b,v14.b,v15.b}[7], [x0], #4
1: tbz x2, #1, 1f
st4 {v12.b,v13.b,v14.b,v15.b}[2], [x0], #4
st4 {v12.b,v13.b,v14.b,v15.b}[3], [x0], #4
1: tbz x2, #0, 1f
st4 {v12.b,v13.b,v14.b,v15.b}[1], [x0], #4
1: b colormatrix_float_realend
/* Float path, partial-block store of single 32-bit values.
 * Bits 2, 1 and 0 of x2 store 4, 2 and 1 elements: all of v16, then lanes
 * 2..3 of v8, then lane 1 of v8. Finishes by branching to
 * colormatrix_float_realend. */
colormatrix_float_stf1_end:
tbz x2, #2, 1f
st1 {v16.4s}, [x0], #16
1: tbz x2, #1, 1f
st1 {v8.d}[1], [x0], #8
1: tbz x2, #0, 1f
st1 {v8.s}[1], [x0], #4
1: b colormatrix_float_realend
/* Float path, partial-block store of 2 x 32-bit groups.
 * Bits 2, 1 and 0 of x2 store 4, 2 and 1 groups: all of v16/v17 interleaved,
 * then lanes 2 and 3 of v8/v9, then lane 1 of v8/v9. Finishes by branching
 * to colormatrix_float_realend. */
colormatrix_float_stf2_end:
tbz x2, #2, 1f
st2 {v16.4s, v17.4s}, [x0], #32
1: tbz x2, #1, 1f
st2 {v8.s,v9.s}[2], [x0], #8
st2 {v8.s,v9.s}[3], [x0], #8
1: tbz x2, #0, 1f
st2 {v8.s,v9.s}[1], [x0], #8
1: b colormatrix_float_realend
/* Float path, partial-block store of 4 x 32-bit groups.
 * The stf3 entry clears v11 and v19 (fourth value of each group stored as
 * zero bits) and falls through into the stf4 entry.
 * Bits 2, 1 and 0 of x2 store 4, 2 and 1 groups: all of v16..v19
 * interleaved, then lanes 2 and 3 of v8..v11, then lane 1 of v8..v11.
 * Finishes by branching to colormatrix_float_realend. */
colormatrix_float_stf3_end:
movi v11.16b, #0
movi v19.16b, #0
colormatrix_float_stf4_end:
tbz x2, #2, 1f
st4 {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
1: tbz x2, #1, 1f
st4 {v8.s,v9.s,v10.s,v11.s}[2], [x0], #16
st4 {v8.s,v9.s,v10.s,v11.s}[3], [x0], #16
1: tbz x2, #0, 1f
st4 {v8.s,v9.s,v10.s,v11.s}[1], [x0], #16
1: b colormatrix_float_realend
/* Float path, partial-block load of single bytes.
 * Bits 2, 1 and 0 of x2 load 4, 2 and 1 bytes into bytes 4..7, 2..3 and 1 of
 * v15. The low eight bytes are then zero-extended to 32 bits and converted
 * to float: elements 0..3 in v12, elements 4..7 in v20. Bytes that were not
 * loaded keep whatever v15 held before. Continues at the stage in x4. */
colormatrix_float_ldu1_end:
tbz x2, #2, 1f
ld1 {v15.s}[1], [x1], #4
1: tbz x2, #1, 1f
ld1 {v15.h}[1], [x1], #2
1: tbz x2, #0, 1f
ld1 {v15.b}[1], [x1], #1
1: uxtl v15.8h, v15.8b
uxtl v12.4s, v15.4h
uxtl2 v20.4s, v15.8h
ucvtf v12.4s, v12.4s
ucvtf v20.4s, v20.4s
br x4
/* Float path, partial-block load of 2-byte groups.
 * Bits 2, 1 and 0 of x2 load 4, 2 and 1 groups into elements 4..7, 2..3 and
 * 1 of v15 (viewed as 2-byte elements). The bytes are widened to 16 bits,
 * split into even (first channel) and odd (second channel) positions with
 * uzp1/uzp2, widened to 32 bits and converted to float: elements 0..3 in
 * v12/v13, elements 4..7 in v20/v21. Continues at the stage in x4. */
colormatrix_float_ldu2_end:
tbz x2, #2, 1f
ld1 {v15.d}[1], [x1], #8
1: tbz x2, #1, 1f
ld1 {v15.s}[1], [x1], #4
1: tbz x2, #0, 1f
ld1 {v15.h}[1], [x1], #2
1: uxtl v14.8h, v15.8b
uxtl2 v15.8h, v15.16b
/* Only the low four lanes of each uzp result are used below. */
uzp1 v12.8h, v14.8h, v14.8h
uzp2 v13.8h, v14.8h, v14.8h
uzp1 v20.8h, v15.8h, v15.8h
uzp2 v21.8h, v15.8h, v15.8h
uxtl v12.4s, v12.4h
uxtl v13.4s, v13.4h
uxtl v20.4s, v20.4h
uxtl v21.4s, v21.4h
ucvtf v12.4s, v12.4s
ucvtf v13.4s, v13.4s
ucvtf v20.4s, v20.4s
ucvtf v21.4s, v21.4s
br x4
/* Float path, partial-block load for the 3-channel byte case.
 * Bits 2, 1 and 0 of x2 load 4, 2 and 1 four-byte groups into lanes 4..7,
 * 2..3 and 1 of v20..v23, one lane per ld4. Only the first three vectors are
 * widened and converted to float: elements 0..3 in v12..v14, elements 4..7
 * in v20..v22. Continues at the stage held in x4. */
colormatrix_float_ldu3_end:
tbz x2, #2, 1f
ld4 {v20.b,v21.b,v22.b,v23.b}[4], [x1], #4
ld4 {v20.b,v21.b,v22.b,v23.b}[5], [x1], #4
ld4 {v20.b,v21.b,v22.b,v23.b}[6], [x1], #4
ld4 {v20.b,v21.b,v22.b,v23.b}[7], [x1], #4
1: tbz x2, #1, 1f
ld4 {v20.b,v21.b,v22.b,v23.b}[2], [x1], #4
ld4 {v20.b,v21.b,v22.b,v23.b}[3], [x1], #4
1: tbz x2, #0, 1f
ld4 {v20.b,v21.b,v22.b,v23.b}[1], [x1], #4
1: uxtl v20.8h, v20.8b
uxtl v21.8h, v21.8b
uxtl v22.8h, v22.8b
uxtl v12.4s, v20.4h
uxtl v13.4s, v21.4h
uxtl v14.4s, v22.4h
uxtl2 v20.4s, v20.8h
uxtl2 v21.4s, v21.8h
uxtl2 v22.4s, v22.8h
ucvtf v12.4s, v12.4s
ucvtf v13.4s, v13.4s
ucvtf v14.4s, v14.4s
ucvtf v20.4s, v20.4s
ucvtf v21.4s, v21.4s
ucvtf v22.4s, v22.4s
br x4
/* Float path, partial-block load of 4-byte groups.
 * Bits 2, 1 and 0 of x2 load 4, 2 and 1 four-byte groups into lanes 4..7,
 * 2..3 and 1 of v20..v23, one lane per ld4. All four vectors are widened to
 * 32 bits and converted to float: elements 0..3 in v12..v15, elements 4..7
 * in v20..v23. Continues at the stage held in x4. */
colormatrix_float_ldu4_end:
tbz x2, #2, 1f
ld4 {v20.b,v21.b,v22.b,v23.b}[4], [x1], #4
ld4 {v20.b,v21.b,v22.b,v23.b}[5], [x1], #4
ld4 {v20.b,v21.b,v22.b,v23.b}[6], [x1], #4
ld4 {v20.b,v21.b,v22.b,v23.b}[7], [x1], #4
1: tbz x2, #1, 1f
ld4 {v20.b,v21.b,v22.b,v23.b}[2], [x1], #4
ld4 {v20.b,v21.b,v22.b,v23.b}[3], [x1], #4
1: tbz x2, #0, 1f
ld4 {v20.b,v21.b,v22.b,v23.b}[1], [x1], #4
1: uxtl v20.8h, v20.8b
uxtl v21.8h, v21.8b
uxtl v22.8h, v22.8b
uxtl v23.8h, v23.8b
uxtl v12.4s, v20.4h
uxtl v13.4s, v21.4h
uxtl v14.4s, v22.4h
uxtl v15.4s, v23.4h
uxtl2 v20.4s, v20.8h
uxtl2 v21.4s, v21.8h
uxtl2 v22.4s, v22.8h
uxtl2 v23.4s, v23.8h
ucvtf v12.4s, v12.4s
ucvtf v13.4s, v13.4s
ucvtf v14.4s, v14.4s
ucvtf v15.4s, v15.4s
ucvtf v20.4s, v20.4s
ucvtf v21.4s, v21.4s
ucvtf v22.4s, v22.4s
ucvtf v23.4s, v23.4s
br x4
/* Float path, partial-block load of single 32-bit values.
 * Bits 2, 1 and 0 of x2 load 4, 2 and 1 elements: all of v20, then lanes
 * 2..3 of v12, then lane 1 of v12. Lanes that were not loaded keep their
 * previous contents. Continues at the stage held in x4. */
colormatrix_float_ldf1_end:
tbz x2, #2, 1f
ld1 {v20.4s}, [x1], #16
1: tbz x2, #1, 1f
ld1 {v12.d}[1], [x1], #8
1: tbz x2, #0, 1f
ld1 {v12.s}[1], [x1], #4
1: br x4
/* Float path, partial-block load of 2 x 32-bit groups.
 * Bits 2, 1 and 0 of x2 load 4, 2 and 1 groups: all of v20/v21
 * de-interleaved, then lanes 2 and 3 of v12/v13, then lane 1 of v12/v13.
 * Lanes that were not loaded keep their previous contents.
 * Continues at the stage held in x4. */
colormatrix_float_ldf2_end:
tbz x2, #2, 1f
ld2 {v20.4s,v21.4s}, [x1], #32
1: tbz x2, #1, 1f
ld2 {v12.s,v13.s}[2], [x1], #8
ld2 {v12.s,v13.s}[3], [x1], #8
1: tbz x2, #0, 1f
ld2 {v12.s,v13.s}[1], [x1], #8
1: br x4
/* Float path, partial-block load of 4 x 32-bit groups; the ldf3 and ldf4
 * labels share this code.
 * Bits 2, 1 and 0 of x2 load 4, 2 and 1 groups: all of v20..v23
 * de-interleaved, then lanes 2 and 3 of v12..v15, then lane 1 of v12..v15.
 * Lanes that were not loaded keep their previous contents.
 * Continues at the stage held in x4. */
colormatrix_float_ldf3_end:
colormatrix_float_ldf4_end:
tbz x2, #2, 1f
ld4 {v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64
1: tbz x2, #1, 1f
ld4 {v12.s,v13.s,v14.s,v15.s}[2], [x1], #16
ld4 {v12.s,v13.s,v14.s,v15.s}[3], [x1], #16
1: tbz x2, #0, 1f
ld4 {v12.s,v13.s,v14.s,v15.s}[1], [x1], #16
1: br x4
/* void rsdIntrinsicColorMatrix_int_K(
 * void *out, // x0
 * void const *in, // x1
 * size_t count, // x2
 * fntab_t const *fns, // x3
 * int16_t const *mult, // x4
 * int32_t const *add); // x5
 *
 * Integer colour-matrix driver. Loads sixteen 16-bit multipliers into v0/v1
 * and four 32-bit add terms into v4, reads six stage pointers from fns
 * (x4..x7 column stages, x8 store stage, x9 load stage) and starts the
 * load -> columns -> store chain, which processes 8 elements per pass.
 * When fewer than 8 elements remain, two more pointers are read from fns
 * (partial-block store and load stages) and one final pass is made.
 * The low 64 bits of v8..v15 are saved on entry and restored on exit.
 */
ENTRY(rsdIntrinsicColorMatrix_int_K)
/* Reserve 64 bytes: v8..v11 at [sp], v12..v15 at [sp + 32]. */
sub x7, sp, #32
sub sp, sp, #64
st1 {v8.1d-v11.1d}, [sp]
st1 {v12.1d-v15.1d}, [x7]
ld1 {v0.8h,v1.8h}, [x4], #32
ld1 {v4.4s}, [x5], #16
/* x4/x5 are reused for stage pointers once mult and add have been read. */
ldp x4,x5, [x3],#16
ldp x6,x7, [x3],#16
ldp x8,x9, [x3],#16
/* Initialise the output columns v8..v11 from the add terms (shifted right by
 * 8 with unsigned saturation). No stage other than the column stages writes
 * v8..v11, so a column whose stage is skipped keeps this value. */
dup v12.4s, v4.s[0]
dup v13.4s, v4.s[1]
dup v14.4s, v4.s[2]
dup v15.4s, v4.s[3]
sqshrun v8.4h, v12.4s, #8
sqshrun2 v8.8h, v12.4s, #8
sqshrun v9.4h, v13.4s, #8
sqshrun2 v9.8h, v13.4s, #8
sqshrun v10.4h, v14.4s, #8
sqshrun2 v10.8h, v14.4s, #8
sqshrun v11.4h, v15.4s, #8
sqshrun2 v11.8h, v15.4s, #8
/* Enter the main loop only if at least 8 elements are available. */
subs x2, x2, #8
blo colormatrix_int_end
br x9
colormatrix_int_end:
/* Undo the last subtraction; x2 is now the number of leftover elements.
 * The addition always carries here, so bls is taken only when x2 is zero. */
adds x2, x2, #8
bls colormatrix_int_realend
/* Switch to the partial-block store/load stages. Any column slot that held
 * the full-block store pointer is redirected to the partial-block store. */
mov x16, x8
ldp x8, x9, [x3], #16
cmp x4, x16
csel x4, x8, x4, eq
cmp x5, x16
csel x5, x8, x5, eq
cmp x6, x16
csel x6, x8, x6, eq
cmp x7, x16
csel x7, x8, x7, eq
br x9
colormatrix_int_realend:
/* Restore the saved vector registers and release the 64-byte frame. */
ld1 {v8.1d-v11.1d}, [sp], #32
ld1 {v12.1d-v15.1d}, [sp], #32
ret
END(rsdIntrinsicColorMatrix_int_K)
/* void rsdIntrinsicColorMatrixSetup_int_K(
 *          fntab_t *fns,       // x0  (eight pointers, written here)
 *          uint32_t mask,      // x1
 *          int dt,             // x2
 *          int st);            // x3
 *
 * Fills the stage table consumed by rsdIntrinsicColorMatrix_int_K:
 *   fns[0..3]  column stages 0..3
 *   fns[4]     full-block store stage     (table 2 below, entry 2*dt)
 *   fns[5]     full-block load stage      (table 3 below, entry 2*st)
 *   fns[6]     partial-block store stage  (table 2 below, entry 2*dt + 1)
 *   fns[7]     partial-block load stage   (table 3 below, entry 2*st + 1)
 * mask holds one 5-bit field per column, column 0 in the lowest bits.
 * The tables store signed 16-bit offsets relative to their own base label.
 */
ENTRY(rsdIntrinsicColorMatrixSetup_int_K)
            /* dt and st are 32-bit ints: the upper halves of x2/x3 are
             * unspecified by the procedure call standard, so index with the
             * sign-extended w registers rather than the full x registers. */
            adr         x7, 2f
            add         x4, x7, w2, sxtw #2
            ldrsh       x2, [x4], #2            /* full-block store offset */
            ldrsh       x4, [x4]                /* partial-block store offset */
            add         x2, x2, x7
            add         x4, x4, x7
            adr         x7, 3f
            add         x5, x7, w3, sxtw #2
            ldrsh       x3, [x5], #2            /* full-block load offset */
            ldrsh       x5, [x5]                /* partial-block load offset */
            add         x3, x3, x7
            add         x5, x5, x7
            stp         x2, x3, [x0, #32]       /* fns[4], fns[5] */
            stp         x4, x5, [x0, #48]       /* fns[6], fns[7] */

/* For each column function, if the matrix is all zeroes then write NULL,
 * otherwise look up the appropriate function and store that. */

            mov         x3, #4
            adr         x7, 4f
1:          ands        x2, x1, #15             /* any multiply bit set? */
            beq         9f                      /* no: x2 == 0 is stored */
            and         x2, x1, #31
            lsl         x2, x2, #3              /* 4 hwords per mask value */
            ldrsh       x2, [x7, x2]
            add         x2, x2, x7
9:          str         x2, [x0], #8
            lsr         x1, x1, #5              /* next column field */
            add         x7, x7, #2              /* next column within a row;
                                                 * table entries compensate */
            subs        x3, x3, #1
            bne         1b

/* For every NULL entry, copy the non-NULL entry that follows it, or the store
 * function. */

            ldr         x2, [x0]                /* x0 -> fns[4] (store stage) */
            mov         x3, #4
1:          ldr         x1, [x0, #-8]!
            cmp         x1, #0
            csel        x2, x1, x2, ne
            str         x2, [x0]
            subs        x3, x3, #1
            bne         1b
            ret

            .align 4
/* Table 2: store stages, (full-block, partial-block) pair per dt value. */
2:          .hword      colormatrix_int_stu1-2b
            .hword      colormatrix_int_stu1_end-2b
            .hword      colormatrix_int_stu2-2b
            .hword      colormatrix_int_stu2_end-2b
            .hword      colormatrix_int_stu3-2b
            .hword      colormatrix_int_stu3_end-2b
            .hword      colormatrix_int_stu4-2b
            .hword      colormatrix_int_stu4_end-2b
/* Table 3: load stages, (full-block, partial-block) pair per st value. */
3:          .hword      colormatrix_int_ldu1-3b
            .hword      colormatrix_int_ldu1_end-3b
            .hword      colormatrix_int_ldu2-3b
            .hword      colormatrix_int_ldu2_end-3b
            .hword      colormatrix_int_ldu3-3b
            .hword      colormatrix_int_ldu3_end-3b
            .hword      colormatrix_int_ldu4-3b
            .hword      colormatrix_int_ldu4_end-3b
/* Table 4: one row of four column stages per 5-bit mask value 0..31. The
 * -2/-4/-6 terms cancel the per-column advance of the base register. */
4:
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
            .hword      colormatrix_int_col0_\i-4b
            .hword      colormatrix_int_col1_\i-4b-2
            .hword      colormatrix_int_col2_\i-4b-4
            .hword      colormatrix_int_col3_\i-4b-6
.endr
.irp i, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
            .hword      colormatrix_int_col0_n\i-4b
            .hword      colormatrix_int_col1_n\i-4b-2
            .hword      colormatrix_int_col2_n\i-4b-4
            .hword      colormatrix_int_col3_n\i-4b-6
.endr
END(rsdIntrinsicColorMatrixSetup_int_K)
/* void rsdIntrinsicColorMatrix_float_K(
 *          void *out,              // x0
 *          void const *in,         // x1
 *          size_t count,           // x2
 *          fntab_t const *fns,     // x3
 *          float const *mult,      // x4
 *          float const *add);      // x5
 *
 * Float colour-matrix driver. Loads sixteen float multipliers into v0..v3
 * and broadcasts the four add terms into v4..v7, reads six stage pointers
 * from fns (x4..x7 column stages, x8 store stage, x9 load stage) and starts
 * the load -> columns -> store chain, which processes 8 elements per pass.
 * When fewer than 8 elements remain, two more pointers are read from fns
 * (partial-block store and load stages) and one final pass is made.
 * The low 64 bits of v8..v15 are saved on entry and restored on exit.
 */
ENTRY(rsdIntrinsicColorMatrix_float_K)
            /* Reserve 64 bytes: v8..v11 at [sp], v12..v15 at [sp + 32]. */
            sub         x7, sp, #32
            sub         sp, sp, #64
            st1         {v8.1d-v11.1d}, [sp]
            st1         {v12.1d-v15.1d}, [x7]

            ld1         {v0.4s,v1.4s,v2.4s,v3.4s}, [x4], #64
            ld1r        {v4.4s}, [x5], #4
            ld1r        {v5.4s}, [x5], #4
            ld1r        {v6.4s}, [x5], #4
            ld1r        {v7.4s}, [x5], #4

            /* x4/x5 are reused for stage pointers once mult and add have
             * been read. */
            ldp         x4,x5, [x3], #16
            ldp         x6,x7, [x3], #16
            ldp         x8,x9, [x3], #16

            /* Initialise both halves of every output column with its add
             * term. Only the column stages overwrite v8..v11 / v16..v19, so
             * a column whose stage is skipped keeps this value. */
            mov         v8.16b, v4.16b
            mov         v9.16b, v5.16b
            mov         v10.16b, v6.16b
            mov         v11.16b, v7.16b

            mov         v16.16b, v4.16b
            mov         v17.16b, v5.16b
            mov         v18.16b, v6.16b
            mov         v19.16b, v7.16b

            /* Enter the main loop only if at least 8 elements are available. */
            subs        x2, x2, #8
            blo         colormatrix_float_end
            br          x9

colormatrix_float_end:
            /* Undo the last subtraction; x2 is now the number of leftover
             * elements. The addition always carries here, so bls is taken
             * only when x2 is zero. */
            adds        x2, x2, #8
            /* Use this function's own epilogue instead of jumping into
             * rsdIntrinsicColorMatrix_int_K. */
            bls         colormatrix_float_realend
            /* Switch to the partial-block store/load stages. Any column slot
             * that held the full-block store pointer is redirected to the
             * partial-block store. */
            mov         x16, x8
            ldp         x8,x9, [x3], #16
            cmp         x4, x16
            csel        x4, x8, x4, eq
            cmp         x5, x16
            csel        x5, x8, x5, eq
            cmp         x6, x16
            csel        x6, x8, x6, eq
            cmp         x7, x16
            csel        x7, x8, x7, eq
            br          x9

colormatrix_float_realend:
            /* Restore the saved vector registers and release the frame. */
            ld1         {v8.1d-v11.1d}, [sp], #32
            ld1         {v12.1d-v15.1d}, [sp], #32
            ret
END(rsdIntrinsicColorMatrix_float_K)
/* void rsdIntrinsicColorMatrixSetup_float_K(
 *          fntab_t *fns,       // x0  (eight pointers, written here)
 *          uint32_t mask,      // x1
 *          int dt,             // x2
 *          int st);            // x3
 *
 * Fills the stage table consumed by rsdIntrinsicColorMatrix_float_K:
 *   fns[0..3]  column stages 0..3
 *   fns[4]     full-block store stage     (table 2 below, entry 2*dt)
 *   fns[5]     full-block load stage      (table 3 below, entry 2*st)
 *   fns[6]     partial-block store stage  (table 2 below, entry 2*dt + 1)
 *   fns[7]     partial-block load stage   (table 3 below, entry 2*st + 1)
 * mask holds one 5-bit field per column, column 0 in the lowest bits.
 * dt/st values 0..3 select the byte (stu/ldu) stages for 1..4 channels and
 * values 4..7 select the 32-bit float (stf/ldf) stages for 1..4 channels.
 * The tables store signed 16-bit offsets relative to their own base label.
 */
ENTRY(rsdIntrinsicColorMatrixSetup_float_K)
            /* dt and st are 32-bit ints: the upper halves of x2/x3 are
             * unspecified by the procedure call standard, so index with the
             * sign-extended w registers rather than the full x registers. */
            adr         x7, 2f
            add         x4, x7, w2, sxtw #2
            ldrsh       x2, [x4], #2            /* full-block store offset */
            ldrsh       x4, [x4]                /* partial-block store offset */
            add         x2, x2, x7
            add         x4, x4, x7
            adr         x7, 3f
            add         x5, x7, w3, sxtw #2
            ldrsh       x3, [x5], #2            /* full-block load offset */
            ldrsh       x5, [x5]                /* partial-block load offset */
            add         x3, x3, x7
            add         x5, x5, x7
            stp         x2, x3, [x0, #32]       /* fns[4], fns[5] */
            stp         x4, x5, [x0, #48]       /* fns[6], fns[7] */

/* For each column function, if the matrix is all zeroes then write NULL,
 * otherwise look up the appropriate function and store that. */

            mov         x3, #4
            adr         x7, 4f
1:          ands        x2, x1, #15             /* any multiply bit set? */
            beq         9f                      /* no: x2 == 0 is stored */
            and         x2, x1, #31
            lsl         x2, x2, #3              /* 4 hwords per mask value */
            ldrsh       x2, [x7, x2]
            add         x2, x2, x7
9:          str         x2, [x0], #8
            lsr         x1, x1, #5              /* next column field */
            add         x7, x7, #2              /* next column within a row;
                                                 * table entries compensate */
            subs        x3, x3, #1
            bne         1b

/* For every NULL entry, copy the non-NULL entry that follows it, or the store
 * function. */

            ldr         x2, [x0]                /* x0 -> fns[4] (store stage) */
            mov         x3, #4
1:          ldr         x1, [x0, #-8]!
            cmp         x1, #0
            csel        x2, x1, x2, ne
            str         x2, [x0]
            subs        x3, x3, #1
            bne         1b
            ret

            .align 4
/* Table 2: store stages, (full-block, partial-block) pair per dt value. */
2:          .hword      colormatrix_float_stu1-2b
            .hword      colormatrix_float_stu1_end-2b
            .hword      colormatrix_float_stu2-2b
            .hword      colormatrix_float_stu2_end-2b
            .hword      colormatrix_float_stu3-2b
            .hword      colormatrix_float_stu3_end-2b
            .hword      colormatrix_float_stu4-2b
            .hword      colormatrix_float_stu4_end-2b
            .hword      colormatrix_float_stf1-2b
            .hword      colormatrix_float_stf1_end-2b
            .hword      colormatrix_float_stf2-2b
            .hword      colormatrix_float_stf2_end-2b
            .hword      colormatrix_float_stf3-2b
            .hword      colormatrix_float_stf3_end-2b
            .hword      colormatrix_float_stf4-2b
            .hword      colormatrix_float_stf4_end-2b
/* Table 3: load stages, (full-block, partial-block) pair per st value. */
3:          .hword      colormatrix_float_ldu1-3b
            .hword      colormatrix_float_ldu1_end-3b
            .hword      colormatrix_float_ldu2-3b
            .hword      colormatrix_float_ldu2_end-3b
            .hword      colormatrix_float_ldu3-3b
            .hword      colormatrix_float_ldu3_end-3b
            .hword      colormatrix_float_ldu4-3b
            .hword      colormatrix_float_ldu4_end-3b
            .hword      colormatrix_float_ldf1-3b
            .hword      colormatrix_float_ldf1_end-3b
            .hword      colormatrix_float_ldf2-3b
            .hword      colormatrix_float_ldf2_end-3b
            .hword      colormatrix_float_ldf3-3b
            .hword      colormatrix_float_ldf3_end-3b
            .hword      colormatrix_float_ldf4-3b
            .hword      colormatrix_float_ldf4_end-3b
/* Table 4: one row of four column stages per 5-bit mask value 0..31. The
 * -2/-4/-6 terms cancel the per-column advance of the base register. */
4:
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
            .hword      colormatrix_float_col0_\i-4b
            .hword      colormatrix_float_col1_\i-4b-2
            .hword      colormatrix_float_col2_\i-4b-4
            .hword      colormatrix_float_col3_\i-4b-6
.endr
.irp i, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
            .hword      colormatrix_float_col0_n\i-4b
            .hword      colormatrix_float_col1_n\i-4b-2
            .hword      colormatrix_float_col2_n\i-4b-4
            .hword      colormatrix_float_col3_n\i-4b-6
.endr
END(rsdIntrinsicColorMatrixSetup_float_K)