| /* |
| * Copyright (C) 2012 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #include <sys/mman.h> |
| #include <unistd.h> |
| |
| #include "rsCpuIntrinsic.h" |
| #include "rsCpuIntrinsicInlines.h" |
| |
| #include <sys/mman.h> |
| #include <stddef.h> |
| #include <stdint.h> |
| #include <stdlib.h> |
| //#include <utils/StopWatch.h> |
| |
| |
| /* uint kernel |
| * Q0 D0: Load slot for R |
| * D1: Load slot for G |
| * Q1 D2: Load slot for B |
| * D3: Load slot for A |
| * Q2 D4: Matrix |
| * D5: = |
| * Q3 D6: = |
| * D7: = |
| * Q4 D8: Add R |
| * D9: |
| * Q5 D10: Add G |
| * D11: |
| * Q6 D12: Add B |
| * D13: |
| * Q7 D14: Add A |
| * D15: |
| * Q8 D16: I32: R Sum |
| * D17: |
| * Q9 D18: I32: G Sum |
| * D19: |
| * Q10 D20: I32: B Sum |
| * D21: |
| * Q11 D22: I32: A Sum |
| * D23: |
| * Q12 D24: U16: expanded R |
| * D25: |
| * Q13 D26: U16: expanded G |
| * D27: |
| * Q14 D28: U16: expanded B |
| * D29: |
| * Q15 D30: U16: expanded A |
| * D31: |
| * |
| */ |
| |
| /* float kernel |
| * Q0 D0: Load slot for R |
| * D1: = |
| * Q1 D2: Load slot for G |
| * D3: = |
| * Q2 D4: Load slot for B |
| * D5: = |
| * Q3 D6: Load slot for A |
| * D7: = |
| * Q4 D8: Matrix |
| * D9: = |
| * Q5 D10: = |
| * D11: = |
| * Q6 D12: = |
| * D13: = |
| * Q7 D14: = |
| * D15: = |
| * Q8 D16: Add R |
| * D17: = |
| * Q9 D18: Add G |
| * D19: = |
| * Q10 D20: Add B |
| * D21: = |
| * Q11 D22: Add A |
| * D23: = |
| * Q12 D24: Sum R |
| * D25: = |
| * Q13 D26: Sum G |
| * D27: = |
| * Q14 D28: Sum B |
| * D29: = |
| * Q15 D30: Sum A |
| * D31: = |
| * |
| */ |
| |
| |
| |
| namespace android { |
| namespace renderscript { |
| |
| typedef union { |
| uint64_t key; |
| struct { |
| uint32_t inVecSize :2; // [0 - 1] |
| uint32_t outVecSize :2; // [2 - 3] |
| uint32_t inType :4; // [4 - 7] |
| uint32_t outType :4; // [8 - 11] |
| uint32_t dot :1; // [12] |
| uint32_t _unused1 :1; // [13] |
| uint32_t copyAlpha :1; // [14] |
| uint32_t _unused2 :1; // [15] |
| uint32_t coeffMask :16; // [16-31] |
| uint32_t addMask :4; // [32-35] |
| } u; |
| } Key_t; |
| |
| //Re-enable when intrinsic is fixed |
| #if defined(ARCH_ARM64_USE_INTRINSICS) |
| typedef struct { |
| void (*column[4])(void); |
| void (*store)(void); |
| void (*load)(void); |
| void (*store_end)(void); |
| void (*load_end)(void); |
| } FunctionTab_t; |
| |
| extern "C" void rsdIntrinsicColorMatrix_int_K( |
| void *out, void const *in, size_t count, |
| FunctionTab_t const *fns, |
| int16_t const *mult, int32_t const *add); |
| |
| extern "C" void rsdIntrinsicColorMatrix_float_K( |
| void *out, void const *in, size_t count, |
| FunctionTab_t const *fns, |
| float const *mult, float const *add); |
| |
| /* The setup functions fill in function tables to be used by above functions; |
| * this code also eliminates jump-to-another-jump cases by short-circuiting |
| * empty functions. While it's not performance critical, it works out easier |
| * to write the set-up code in assembly than to try to expose the same symbols |
| * and write the code in C. |
| */ |
| extern "C" void rsdIntrinsicColorMatrixSetup_int_K( |
| FunctionTab_t *fns, |
| uint32_t mask, int dt, int st); |
| |
| extern "C" void rsdIntrinsicColorMatrixSetup_float_K( |
| FunctionTab_t *fns, |
| uint32_t mask, int dt, int st); |
| #endif |
| |
| class RsdCpuScriptIntrinsicColorMatrix : public RsdCpuScriptIntrinsic { |
| public: |
| void populateScript(Script *) override; |
| |
| void setGlobalVar(uint32_t slot, const void *data, size_t dataLength) override; |
| |
| ~RsdCpuScriptIntrinsicColorMatrix() override; |
| RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e); |
| |
| void preLaunch(uint32_t slot, const Allocation ** ains, |
| uint32_t inLen, Allocation * aout, const void * usr, |
| uint32_t usrLen, const RsScriptCall *sc) override; |
| |
| protected: |
| float fp[16]; |
| float fpa[4]; |
| |
| // The following four fields are read as constants |
| // by the SIMD assembly code. |
| short ip[16]; |
| int ipa[4]; |
| float tmpFp[16]; |
| float tmpFpa[4]; |
| #if defined(ARCH_ARM64_USE_INTRINSICS) |
| FunctionTab_t mFnTab; |
| #endif |
| |
| static void kernel(const RsExpandKernelDriverInfo *info, |
| uint32_t xstart, uint32_t xend, |
| uint32_t outstep); |
| void updateCoeffCache(float fpMul, float addMul); |
| |
| Key_t mLastKey; |
| unsigned char *mBuf; |
| size_t mBufSize; |
| |
| Key_t computeKey(const Element *ein, const Element *eout); |
| |
| bool build(Key_t key); |
| |
| void (*mOptKernel)(void *dst, const void *src, const short *coef, uint32_t count); |
| |
| }; |
| |
| |
| Key_t RsdCpuScriptIntrinsicColorMatrix::computeKey( |
| const Element *ein, const Element *eout) { |
| |
| Key_t key; |
| key.key = 0; |
| |
| // Compute a unique code key for this operation |
| |
| // Add to the key the input and output types |
| bool hasFloat = false; |
| if (ein->getType() == RS_TYPE_FLOAT_32) { |
| hasFloat = true; |
| key.u.inType = RS_TYPE_FLOAT_32; |
| rsAssert(key.u.inType == RS_TYPE_FLOAT_32); |
| } |
| if (eout->getType() == RS_TYPE_FLOAT_32) { |
| hasFloat = true; |
| key.u.outType = RS_TYPE_FLOAT_32; |
| rsAssert(key.u.outType == RS_TYPE_FLOAT_32); |
| } |
| |
| // Mask in the bits indicating which coefficients in the |
| // color matrix are needed. |
| if (hasFloat) { |
| for (uint32_t i=0; i < 16; i++) { |
| if (fabs(fp[i]) != 0.f) { |
| key.u.coeffMask |= 1 << i; |
| } |
| } |
| if (fabs(fpa[0]) != 0.f) key.u.addMask |= 0x1; |
| if (fabs(fpa[1]) != 0.f) key.u.addMask |= 0x2; |
| if (fabs(fpa[2]) != 0.f) key.u.addMask |= 0x4; |
| if (fabs(fpa[3]) != 0.f) key.u.addMask |= 0x8; |
| |
| } else { |
| for (uint32_t i=0; i < 16; i++) { |
| if (ip[i] != 0) { |
| key.u.coeffMask |= 1 << i; |
| } |
| } |
| if (ipa[0] != 0) key.u.addMask |= 0x1; |
| if (ipa[1] != 0) key.u.addMask |= 0x2; |
| if (ipa[2] != 0) key.u.addMask |= 0x4; |
| if (ipa[3] != 0) key.u.addMask |= 0x8; |
| } |
| |
| // Look for a dot product where the r,g,b colums are the same |
| if ((ip[0] == ip[1]) && (ip[0] == ip[2]) && |
| (ip[4] == ip[5]) && (ip[4] == ip[6]) && |
| (ip[8] == ip[9]) && (ip[8] == ip[10]) && |
| (ip[12] == ip[13]) && (ip[12] == ip[14])) { |
| |
| if (!key.u.addMask) key.u.dot = 1; |
| } |
| |
| // Is alpha a simple copy |
| if (!(key.u.coeffMask & 0x0888) && (ip[15] == 256) && !(key.u.addMask & 0x8)) { |
| key.u.copyAlpha = !(key.u.inType || key.u.outType); |
| } |
| |
| //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key); |
| |
| switch (ein->getVectorSize()) { |
| case 4: |
| key.u.inVecSize = 3; |
| break; |
| case 3: |
| key.u.inVecSize = 2; |
| key.u.coeffMask &= ~0xF000; |
| break; |
| case 2: |
| key.u.inVecSize = 1; |
| key.u.coeffMask &= ~0xFF00; |
| break; |
| default: |
| key.u.coeffMask &= ~0xFFF0; |
| break; |
| } |
| |
| switch (eout->getVectorSize()) { |
| case 4: |
| key.u.outVecSize = 3; |
| break; |
| case 3: |
| key.u.outVecSize = 2; |
| key.u.coeffMask &= ~0x8888; |
| key.u.addMask &= 7; |
| break; |
| case 2: |
| key.u.outVecSize = 1; |
| key.u.coeffMask &= ~0xCCCC; |
| key.u.addMask &= 3; |
| break; |
| default: |
| key.u.coeffMask &= ~0xEEEE; |
| key.u.addMask &= 1; |
| break; |
| } |
| |
| if (key.u.inType && !key.u.outType) { |
| key.u.addMask |= 1; |
| if (key.u.outVecSize > 0) key.u.addMask |= 2; |
| if (key.u.outVecSize > 1) key.u.addMask |= 4; |
| if (key.u.outVecSize > 2) key.u.addMask |= 8; |
| } |
| |
| //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key); |
| return key; |
| } |
| |
| } // namespace renderscript |
| } // namespace android |
| |
| #if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS) |
| |
| #define DEF_SYM(x) \ |
| extern "C" uint32_t _N_ColorMatrix_##x; \ |
| extern "C" uint32_t _N_ColorMatrix_##x##_end; \ |
| extern "C" uint32_t _N_ColorMatrix_##x##_len; |
| |
| DEF_SYM(prefix_i) |
| DEF_SYM(prefix_f) |
| DEF_SYM(postfix1) |
| DEF_SYM(postfix2) |
| |
| DEF_SYM(load_u8_4) |
| DEF_SYM(load_u8_3) |
| DEF_SYM(load_u8_2) |
| DEF_SYM(load_u8_1) |
| DEF_SYM(load_u8f_4) |
| DEF_SYM(load_u8f_3) |
| DEF_SYM(load_u8f_2) |
| DEF_SYM(load_u8f_1) |
| DEF_SYM(load_f32_4) |
| DEF_SYM(load_f32_3) |
| DEF_SYM(load_f32_2) |
| DEF_SYM(load_f32_1) |
| |
| DEF_SYM(store_u8_4) |
| DEF_SYM(store_u8_2) |
| DEF_SYM(store_u8_1) |
| DEF_SYM(store_f32_4) |
| DEF_SYM(store_f32_3) |
| DEF_SYM(store_f32_2) |
| DEF_SYM(store_f32_1) |
| DEF_SYM(store_f32u_4) |
| DEF_SYM(store_f32u_2) |
| DEF_SYM(store_f32u_1) |
| |
| DEF_SYM(unpack_u8_4) |
| DEF_SYM(unpack_u8_3) |
| DEF_SYM(unpack_u8_2) |
| DEF_SYM(unpack_u8_1) |
| DEF_SYM(pack_u8_4) |
| DEF_SYM(pack_u8_3) |
| DEF_SYM(pack_u8_2) |
| DEF_SYM(pack_u8_1) |
| DEF_SYM(dot) |
| DEF_SYM(add_0_u8) |
| DEF_SYM(add_1_u8) |
| DEF_SYM(add_2_u8) |
| DEF_SYM(add_3_u8) |
| |
| #define ADD_CHUNK(x) \ |
| memcpy(buf, &_N_ColorMatrix_##x, _N_ColorMatrix_##x##_len); \ |
| buf += _N_ColorMatrix_##x##_len |
| |
| |
| static uint8_t * addBranch(uint8_t *buf, const uint8_t *target, uint32_t condition) { |
| size_t off = (target - buf - 8) >> 2; |
| rsAssert(((off & 0xff000000) == 0) || |
| ((off & 0xff000000) == 0xff000000)); |
| |
| uint32_t op = (condition << 28); |
| op |= 0xa << 24; // branch |
| op |= 0xffffff & off; |
| ((uint32_t *)buf)[0] = op; |
| return buf + 4; |
| } |
| |
| static uint32_t encodeSIMDRegs(uint32_t vd, uint32_t vn, uint32_t vm) { |
| rsAssert(vd < 32); |
| rsAssert(vm < 32); |
| rsAssert(vn < 32); |
| |
| uint32_t op = ((vd & 0xf) << 12) | (((vd & 0x10) >> 4) << 22); |
| op |= (vm & 0xf) | (((vm & 0x10) >> 4) << 5); |
| op |= ((vn & 0xf) << 16) | (((vn & 0x10) >> 4) << 7); |
| return op; |
| } |
| |
| static uint8_t * addVMLAL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) { |
| //vmlal.s16 Q#1, D#1, D#2[#] |
| uint32_t op = 0xf2900240 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3)); |
| ((uint32_t *)buf)[0] = op; |
| return buf + 4; |
| } |
| |
| static uint8_t * addVMULL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) { |
| //vmull.s16 Q#1, D#1, D#2[#] |
| uint32_t op = 0xf2900A40 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3)); |
| ((uint32_t *)buf)[0] = op; |
| return buf + 4; |
| } |
| |
| static uint8_t * addVQADD_S32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) { |
| //vqadd.s32 Q#1, Q#1, Q#2 |
| uint32_t op = 0xf2200050 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1); |
| ((uint32_t *)buf)[0] = op; |
| return buf + 4; |
| } |
| |
| static uint8_t * addVMLAL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) { |
| //vmlal.f32 Q#1, D#1, D#2[#] |
| uint32_t op = 0xf3a00140 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4)); |
| ((uint32_t *)buf)[0] = op; |
| return buf + 4; |
| } |
| |
| static uint8_t * addVMULL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) { |
| //vmull.f32 Q#1, D#1, D#2[#] |
| uint32_t op = 0xf3a00940 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4)); |
| ((uint32_t *)buf)[0] = op; |
| return buf + 4; |
| } |
| |
| static uint8_t * addVORR_32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) { |
| //vadd.f32 Q#1, D#1, D#2 |
| uint32_t op = 0xf2200150 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1); |
| ((uint32_t *)buf)[0] = op; |
| return buf + 4; |
| } |
| |
| static uint8_t * addVMOV_32(uint8_t *buf, uint32_t dest_q, uint32_t imm) { |
| //vmov.32 Q#1, #imm |
| rsAssert(imm == 0); |
| uint32_t op = 0xf2800050 | encodeSIMDRegs(dest_q << 1, 0, 0); |
| ((uint32_t *)buf)[0] = op; |
| return buf + 4; |
| } |
| |
| static uint8_t * addVADD_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) { |
| //vadd.f32 Q#1, D#1, D#2 |
| uint32_t op = 0xf2000d40 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1); |
| ((uint32_t *)buf)[0] = op; |
| return buf + 4; |
| } |
| #endif |
| |
| #if defined(ARCH_X86_HAVE_SSSE3) |
| extern void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src, |
| const short *coef, uint32_t count); |
| extern void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src, |
| const short *coef, uint32_t count); |
| extern void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src, |
| const short *coef, uint32_t count); |
| |
| using android::renderscript::Key_t; |
| |
| void * selectKernel(Key_t key) |
| { |
| void * kernel = nullptr; |
| |
| // inType, outType float if nonzero |
| if (!(key.u.inType || key.u.outType)) { |
| if (key.u.dot) |
| kernel = (void *)rsdIntrinsicColorMatrixDot_K; |
| else if (key.u.copyAlpha) |
| kernel = (void *)rsdIntrinsicColorMatrix3x3_K; |
| else |
| kernel = (void *)rsdIntrinsicColorMatrix4x4_K; |
| } |
| |
| return kernel; |
| } |
| #endif |
| |
| namespace android { |
| namespace renderscript { |
| |
| bool RsdCpuScriptIntrinsicColorMatrix::build(Key_t key) { |
| #if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS) |
| mBufSize = 4096; |
| //StopWatch build_time("rs cm: build time"); |
| mBuf = (uint8_t *)mmap(0, mBufSize, PROT_READ | PROT_WRITE, |
| MAP_PRIVATE | MAP_ANON, -1, 0); |
| if (mBuf == MAP_FAILED) { |
| mBuf = NULL; |
| return false; |
| } |
| |
| uint8_t *buf = mBuf; |
| uint8_t *buf2 = nullptr; |
| |
| int ops[5][4]; // 0=unused, 1 = set, 2 = accumulate, 3 = final |
| int opInit[4] = {0, 0, 0, 0}; |
| |
| memset(ops, 0, sizeof(ops)); |
| for (int i=0; i < 4; i++) { |
| if (key.u.coeffMask & (1 << (i*4))) { |
| ops[i][0] = 0x2 | opInit[0]; |
| opInit[0] = 1; |
| } |
| if (!key.u.dot) { |
| if (key.u.coeffMask & (1 << (1 + i*4))) { |
| ops[i][1] = 0x2 | opInit[1]; |
| opInit[1] = 1; |
| } |
| if (key.u.coeffMask & (1 << (2 + i*4))) { |
| ops[i][2] = 0x2 | opInit[2]; |
| opInit[2] = 1; |
| } |
| } |
| if (!key.u.copyAlpha) { |
| if (key.u.coeffMask & (1 << (3 + i*4))) { |
| ops[i][3] = 0x2 | opInit[3]; |
| opInit[3] = 1; |
| } |
| } |
| } |
| |
| if (key.u.inType || key.u.outType) { |
| key.u.copyAlpha = 0; |
| ADD_CHUNK(prefix_f); |
| buf2 = buf; |
| |
| // Load the incoming r,g,b,a as needed |
| if (key.u.inType) { |
| switch(key.u.inVecSize) { |
| case 3: |
| ADD_CHUNK(load_f32_4); |
| break; |
| case 2: |
| ADD_CHUNK(load_f32_3); |
| break; |
| case 1: |
| ADD_CHUNK(load_f32_2); |
| break; |
| case 0: |
| ADD_CHUNK(load_f32_1); |
| break; |
| } |
| } else { |
| switch(key.u.inVecSize) { |
| case 3: |
| ADD_CHUNK(load_u8f_4); |
| break; |
| case 2: |
| ADD_CHUNK(load_u8f_3); |
| break; |
| case 1: |
| ADD_CHUNK(load_u8f_2); |
| break; |
| case 0: |
| ADD_CHUNK(load_u8f_1); |
| break; |
| } |
| } |
| |
| for (int i=0; i < 4; i++) { |
| for (int j=0; j < 4; j++) { |
| switch(ops[i][j]) { |
| case 0: |
| break; |
| case 2: |
| buf = addVMULL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1); |
| break; |
| case 3: |
| buf = addVMLAL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1); |
| break; |
| } |
| } |
| } |
| for (int j=0; j < 4; j++) { |
| if (opInit[j]) { |
| if (key.u.addMask & (1 << j)) { |
| buf = addVADD_F32(buf, j, 12+j, 8+j); |
| } else { |
| buf = addVORR_32(buf, j, 12+j, 12+j); |
| } |
| } else { |
| if (key.u.addMask & (1 << j)) { |
| buf = addVORR_32(buf, j, 8+j, 8+j); |
| } else { |
| buf = addVMOV_32(buf, j, 0); |
| } |
| } |
| } |
| |
| if (key.u.outType) { |
| switch(key.u.outVecSize) { |
| case 3: |
| ADD_CHUNK(store_f32_4); |
| break; |
| case 2: |
| ADD_CHUNK(store_f32_3); |
| break; |
| case 1: |
| ADD_CHUNK(store_f32_2); |
| break; |
| case 0: |
| ADD_CHUNK(store_f32_1); |
| break; |
| } |
| } else { |
| switch(key.u.outVecSize) { |
| case 3: |
| case 2: |
| ADD_CHUNK(store_f32u_4); |
| break; |
| case 1: |
| ADD_CHUNK(store_f32u_2); |
| break; |
| case 0: |
| ADD_CHUNK(store_f32u_1); |
| break; |
| } |
| } |
| |
| |
| } else { |
| // Add the function prefix |
| // Store the address for the loop return |
| ADD_CHUNK(prefix_i); |
| buf2 = buf; |
| |
| // Load the incoming r,g,b,a as needed |
| switch(key.u.inVecSize) { |
| case 3: |
| ADD_CHUNK(load_u8_4); |
| if (key.u.copyAlpha) { |
| ADD_CHUNK(unpack_u8_3); |
| } else { |
| ADD_CHUNK(unpack_u8_4); |
| } |
| break; |
| case 2: |
| ADD_CHUNK(load_u8_3); |
| ADD_CHUNK(unpack_u8_3); |
| break; |
| case 1: |
| ADD_CHUNK(load_u8_2); |
| ADD_CHUNK(unpack_u8_2); |
| break; |
| case 0: |
| ADD_CHUNK(load_u8_1); |
| ADD_CHUNK(unpack_u8_1); |
| break; |
| } |
| |
| // Add multiply and accumulate |
| // use MULL to init the output register, |
| // use MLAL from there |
| for (int i=0; i < 4; i++) { |
| for (int j=0; j < 4; j++) { |
| switch(ops[i][j]) { |
| case 0: |
| break; |
| case 2: |
| buf = addVMULL_S16(buf, 8+j, 24+i*2, 4+i, j); |
| break; |
| case 3: |
| buf = addVMLAL_S16(buf, 8+j, 24+i*2, 4+i, j); |
| break; |
| } |
| } |
| } |
| for (int j=0; j < 4; j++) { |
| if (opInit[j]) { |
| if (key.u.addMask & (1 << j)) { |
| buf = addVQADD_S32(buf, 8+j, 8+j, 4+j); |
| } |
| } else { |
| if (key.u.addMask & (1 << j)) { |
| buf = addVORR_32(buf, 8+j, 4+j, 4+j); |
| } |
| } |
| } |
| |
| // If we have a dot product, perform the special pack. |
| if (key.u.dot) { |
| ADD_CHUNK(pack_u8_1); |
| ADD_CHUNK(dot); |
| } else { |
| switch(key.u.outVecSize) { |
| case 3: |
| if (key.u.copyAlpha) { |
| ADD_CHUNK(pack_u8_3); |
| } else { |
| ADD_CHUNK(pack_u8_4); |
| } |
| break; |
| case 2: |
| ADD_CHUNK(pack_u8_3); |
| break; |
| case 1: |
| ADD_CHUNK(pack_u8_2); |
| break; |
| case 0: |
| ADD_CHUNK(pack_u8_1); |
| break; |
| } |
| } |
| |
| // Write out result |
| switch(key.u.outVecSize) { |
| case 3: |
| case 2: |
| ADD_CHUNK(store_u8_4); |
| break; |
| case 1: |
| ADD_CHUNK(store_u8_2); |
| break; |
| case 0: |
| ADD_CHUNK(store_u8_1); |
| break; |
| } |
| } |
| |
| if (key.u.inType != key.u.outType) { |
| key.u.copyAlpha = 0; |
| key.u.dot = 0; |
| } |
| |
| // Loop, branch, and cleanup |
| ADD_CHUNK(postfix1); |
| buf = addBranch(buf, buf2, 0x01); |
| ADD_CHUNK(postfix2); |
| |
| int ret = mprotect(mBuf, mBufSize, PROT_READ | PROT_EXEC); |
| if (ret == -1) { |
| ALOGE("mprotect error %i", ret); |
| return false; |
| } |
| |
| __builtin___clear_cache((char *) mBuf, (char*) mBuf + mBufSize); |
| return true; |
| #else |
| return false; |
| #endif |
| } |
| |
| void RsdCpuScriptIntrinsicColorMatrix::updateCoeffCache(float fpMul, float addMul) { |
| for(int ct=0; ct < 16; ct++) { |
| ip[ct] = (short)(fp[ct] * 256.f + 0.5f); |
| tmpFp[ct] = fp[ct] * fpMul; |
| //ALOGE("mat %i %f %f", ct, fp[ct], tmpFp[ct]); |
| } |
| |
| float add = 0.f; |
| if (fpMul > 254.f) add = 0.5f; |
| for(int ct=0; ct < 4; ct++) { |
| tmpFpa[ct] = fpa[ct] * addMul + add; |
| //ALOGE("fpa %i %f %f", ct, fpa[ct], tmpFpa[ct * 4 + 0]); |
| } |
| |
| for(int ct=0; ct < 4; ct++) { |
| ipa[ct] = (int)(fpa[ct] * 65536.f + 0.5f); |
| } |
| } |
| |
| void RsdCpuScriptIntrinsicColorMatrix::setGlobalVar(uint32_t slot, const void *data, |
| size_t dataLength) { |
| switch(slot) { |
| case 0: |
| memcpy (fp, data, sizeof(fp)); |
| break; |
| case 1: |
| memcpy (fpa, data, sizeof(fpa)); |
| break; |
| default: |
| rsAssert(0); |
| break; |
| } |
| mRootPtr = &kernel; |
| } |
| |
| |
| static void One(const RsExpandKernelDriverInfo *info, void *out, |
| const void *py, const float* coeff, const float *add, |
| uint32_t vsin, uint32_t vsout, bool fin, bool fout) { |
| |
| float4 f = 0.f; |
| if (fin) { |
| switch(vsin) { |
| case 3: |
| f = ((const float4 *)py)[0]; |
| break; |
| case 2: |
| f = ((const float4 *)py)[0]; |
| f.w = 0.f; |
| break; |
| case 1: |
| f.xy = ((const float2 *)py)[0]; |
| break; |
| case 0: |
| f.x = ((const float *)py)[0]; |
| break; |
| } |
| } else { |
| switch(vsin) { |
| case 3: |
| f = convert_float4(((const uchar4 *)py)[0]); |
| break; |
| case 2: |
| f = convert_float4(((const uchar4 *)py)[0]); |
| f.w = 0.f; |
| break; |
| case 1: |
| f.xy = convert_float2(((const uchar2 *)py)[0]); |
| break; |
| case 0: |
| f.x = (float)(((const uchar *)py)[0]); |
| break; |
| } |
| } |
| //ALOGE("f1 %f %f %f %f", f.x, f.y, f.z, f.w); |
| |
| float4 sum; |
| sum.x = f.x * coeff[0] + |
| f.y * coeff[4] + |
| f.z * coeff[8] + |
| f.w * coeff[12]; |
| sum.y = f.x * coeff[1] + |
| f.y * coeff[5] + |
| f.z * coeff[9] + |
| f.w * coeff[13]; |
| sum.z = f.x * coeff[2] + |
| f.y * coeff[6] + |
| f.z * coeff[10] + |
| f.w * coeff[14]; |
| sum.w = f.x * coeff[3] + |
| f.y * coeff[7] + |
| f.z * coeff[11] + |
| f.w * coeff[15]; |
| //ALOGE("f2 %f %f %f %f", sum.x, sum.y, sum.z, sum.w); |
| |
| sum.x += add[0]; |
| sum.y += add[1]; |
| sum.z += add[2]; |
| sum.w += add[3]; |
| |
| |
| //ALOGE("fout %i vs %i, sum %f %f %f %f", fout, vsout, sum.x, sum.y, sum.z, sum.w); |
| if (fout) { |
| switch(vsout) { |
| case 3: |
| case 2: |
| ((float4 *)out)[0] = sum; |
| break; |
| case 1: |
| ((float2 *)out)[0] = sum.xy; |
| break; |
| case 0: |
| ((float *)out)[0] = sum.x; |
| break; |
| } |
| } else { |
| sum.x = sum.x < 0 ? 0 : (sum.x > 255.5 ? 255.5 : sum.x); |
| sum.y = sum.y < 0 ? 0 : (sum.y > 255.5 ? 255.5 : sum.y); |
| sum.z = sum.z < 0 ? 0 : (sum.z > 255.5 ? 255.5 : sum.z); |
| sum.w = sum.w < 0 ? 0 : (sum.w > 255.5 ? 255.5 : sum.w); |
| |
| switch(vsout) { |
| case 3: |
| case 2: |
| ((uchar4 *)out)[0] = convert_uchar4(sum); |
| break; |
| case 1: |
| ((uchar2 *)out)[0] = convert_uchar2(sum.xy); |
| break; |
| case 0: |
| ((uchar *)out)[0] = sum.x; |
| break; |
| } |
| } |
| //ALOGE("out %p %f %f %f %f", out, ((float *)out)[0], ((float *)out)[1], ((float *)out)[2], ((float *)out)[3]); |
| } |
| |
| void RsdCpuScriptIntrinsicColorMatrix::kernel(const RsExpandKernelDriverInfo *info, |
| uint32_t xstart, uint32_t xend, |
| uint32_t outstep) { |
| RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)info->usr; |
| |
| uint32_t instep = info->inStride[0]; |
| |
| uchar *out = (uchar *)info->outPtr[0]; |
| uchar *in = (uchar *)info->inPtr[0]; |
| uint32_t x1 = xstart; |
| uint32_t x2 = xend; |
| |
| uint32_t vsin = cp->mLastKey.u.inVecSize; |
| uint32_t vsout = cp->mLastKey.u.outVecSize; |
| bool floatIn = !!cp->mLastKey.u.inType; |
| bool floatOut = !!cp->mLastKey.u.outType; |
| |
| //if (!info->current.y) ALOGE("steps %i %i %i %i", instep, outstep, vsin, vsout); |
| |
| if(x2 > x1) { |
| int32_t len = x2 - x1; |
| if (gArchUseSIMD) { |
| if((cp->mOptKernel != nullptr) && (len >= 4)) { |
| // The optimized kernel processes 4 pixels at once |
| // and requires a minimum of 1 chunk of 4 |
| cp->mOptKernel(out, in, cp->ip, len >> 2); |
| // Update the len and pointers so the generic code can |
| // finish any leftover pixels |
| len &= ~3; |
| x1 += len; |
| out += outstep * len; |
| in += instep * len; |
| } |
| #if defined(ARCH_ARM64_USE_INTRINSICS) |
| else { |
| if (cp->mLastKey.u.inType == RS_TYPE_FLOAT_32 || cp->mLastKey.u.outType == RS_TYPE_FLOAT_32) { |
| // Currently this generates off by one errors. |
| //rsdIntrinsicColorMatrix_float_K(out, in, len, &cp->mFnTab, cp->tmpFp, cp->tmpFpa); |
| //x1 += len; |
| //out += outstep * len; |
| //in += instep * len; |
| } else { |
| rsdIntrinsicColorMatrix_int_K(out, in, len, &cp->mFnTab, cp->ip, cp->ipa); |
| x1 += len; |
| out += outstep * len; |
| in += instep * len; |
| } |
| } |
| #endif |
| } |
| |
| while(x1 != x2) { |
| One(info, out, in, cp->tmpFp, cp->tmpFpa, vsin, vsout, floatIn, floatOut); |
| out += outstep; |
| in += instep; |
| x1++; |
| } |
| } |
| } |
| |
| void RsdCpuScriptIntrinsicColorMatrix::preLaunch(uint32_t slot, |
| const Allocation ** ains, |
| uint32_t inLen, |
| Allocation * aout, |
| const void * usr, |
| uint32_t usrLen, |
| const RsScriptCall *sc) { |
| |
| const Element *ein = ains[0]->mHal.state.type->getElement(); |
| const Element *eout = aout->mHal.state.type->getElement(); |
| |
| if (ein->getType() == eout->getType()) { |
| if (eout->getType() == RS_TYPE_UNSIGNED_8) { |
| updateCoeffCache(1.f, 255.f); |
| } else { |
| updateCoeffCache(1.f, 1.f); |
| } |
| } else { |
| if (eout->getType() == RS_TYPE_UNSIGNED_8) { |
| updateCoeffCache(255.f, 255.f); |
| } else { |
| updateCoeffCache(1.f / 255.f, 1.f); |
| } |
| } |
| |
| Key_t key = computeKey(ein, eout); |
| |
| #if defined(ARCH_X86_HAVE_SSSE3) |
| if ((mOptKernel == nullptr) || (mLastKey.key != key.key)) { |
| // FIXME: Disable mOptKernel to pass RS color matrix CTS cases |
| // mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) selectKernel(key); |
| mLastKey = key; |
| } |
| |
| #else //if !defined(ARCH_X86_HAVE_SSSE3) |
| if ((mOptKernel == nullptr) || (mLastKey.key != key.key)) { |
| if (mBuf) munmap(mBuf, mBufSize); |
| mBuf = nullptr; |
| mOptKernel = nullptr; |
| if (build(key)) { |
| mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) mBuf; |
| } |
| #if defined(ARCH_ARM64_USE_INTRINSICS) |
| else { |
| int dt = key.u.outVecSize + (key.u.outType == RS_TYPE_FLOAT_32 ? 4 : 0); |
| int st = key.u.inVecSize + (key.u.inType == RS_TYPE_FLOAT_32 ? 4 : 0); |
| uint32_t mm = 0; |
| int i; |
| for (i = 0; i < 4; i++) |
| { |
| uint32_t m = (key.u.coeffMask >> i) & 0x1111; |
| m = ((m * 0x249) >> 9) & 15; |
| m |= ((key.u.addMask >> i) & 1) << 4; |
| mm |= m << (i * 5); |
| } |
| |
| if (key.u.inType == RS_TYPE_FLOAT_32 || key.u.outType == RS_TYPE_FLOAT_32) { |
| rsdIntrinsicColorMatrixSetup_float_K(&mFnTab, mm, dt, st); |
| } else { |
| rsdIntrinsicColorMatrixSetup_int_K(&mFnTab, mm, dt, st); |
| } |
| } |
| #endif |
| mLastKey = key; |
| } |
| #endif //if !defined(ARCH_X86_HAVE_SSSE3) |
| } |
| |
| RsdCpuScriptIntrinsicColorMatrix::RsdCpuScriptIntrinsicColorMatrix( |
| RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) |
| : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX) { |
| |
| mLastKey.key = 0; |
| mBuf = nullptr; |
| mBufSize = 0; |
| mOptKernel = nullptr; |
| const static float defaultMatrix[] = { |
| 1.f, 0.f, 0.f, 0.f, |
| 0.f, 1.f, 0.f, 0.f, |
| 0.f, 0.f, 1.f, 0.f, |
| 0.f, 0.f, 0.f, 1.f |
| }; |
| const static float defaultAdd[] = {0.f, 0.f, 0.f, 0.f}; |
| setGlobalVar(0, defaultMatrix, sizeof(defaultMatrix)); |
| setGlobalVar(1, defaultAdd, sizeof(defaultAdd)); |
| } |
| |
| RsdCpuScriptIntrinsicColorMatrix::~RsdCpuScriptIntrinsicColorMatrix() { |
| if (mBuf) munmap(mBuf, mBufSize); |
| mBuf = nullptr; |
| mOptKernel = nullptr; |
| } |
| |
| void RsdCpuScriptIntrinsicColorMatrix::populateScript(Script *s) { |
| s->mHal.info.exportedVariableCount = 2; |
| } |
| |
| RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx, |
| const Script *s, const Element *e) { |
| |
| return new RsdCpuScriptIntrinsicColorMatrix(ctx, s, e); |
| } |
| |
| } // namespace renderscript |
| } // namespace android |