blob: b95e07fa50962bb87c64543eff313e275cb8045b [file] [log] [blame]
//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
//******************************************************************************
//*
//* @brief
//* This file contains definitions of routines for spatial filter
//*
//* @author
//* Ittiam
//*
//* @par List of Functions:
//* - ideint_spatial_filter_av8()
//*
//* @remarks
//* None
//*
//*******************************************************************************
//******************************************************************************
//*
//* @brief Performs spatial filtering
//*
//* @par Description
//* This function performs edge-adaptive spatial filtering on an 8x8 block
//*
//* @param[in] pu1_src
//* UWORD8 pointer to the source
//*
//* @param[in] pu1_out
//* UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//* source stride
//*
//* @param[in] out_strd
//* destination stride
//*
//* @returns
//* None
//*
//* @remarks
//*
//******************************************************************************
// -----------------------------------------------------------------------------
// ideint_spatial_filter_av8  (AArch64 / NEON, leaf routine, no stack usage)
//
// Edge-adaptive spatial interpolation.  For each 4-column half of the block an
// interpolation direction ("shift" of -1, 0 or +1 pixels) is chosen from biased
// sums of absolute differences between vertically adjacent source rows, then
// every output row is the rounded average of source row n sampled at
// [i + shift] and source row n+1 sampled at [i - shift].
//
// Inputs:
//   x0 = pu1_src   first source row; bytes [-1 .. 8] of five consecutive rows
//                  (pu1_src + k * src_strd, k = 0..4) are read
//   x1 = pu1_out   destination; four rows of 8 bytes are written
//   w2 = src_strd  source stride in bytes
//   w3 = out_strd  destination stride in bytes
//
// NOTE(review): the strides are taken from w2/w3 and sign-extended, i.e. they
// are assumed to be 32-bit signed ints in the C prototype - confirm against the
// caller.  (A 64-bit stride that fits in 32 bits behaves identically.)
//
// Returns:  nothing
// Clobbers: x0-x11, v0-v5, v16, v18, v20, v30, v31, NZCV
// -----------------------------------------------------------------------------
.global ideint_spatial_filter_av8
ideint_spatial_filter_av8:
    // For a 32-bit argument the upper half of the X register is unspecified
    // under AAPCS64; widen before the strides are used as 64-bit post-index
    // offsets below.
    sxtw    x2, w2
    sxtw    x3, w3

    // Per-column absolute-difference accumulators (8 x u16 each)
    movi    v16.8h, #0                  // adiff[0]: row_1[i]     vs row_2[i]
    movi    v18.8h, #0                  // adiff[1]: row_1[i - 1] vs row_2[i + 1]
    movi    v20.8h, #0                  // adiff[2]: row_1[i + 1] vs row_2[i - 1]

    // Keep the block origin; x0 is advanced by the loads below
    mov     x10, x0

    // Prime row_1 with the first source row: centre, left and right neighbours
    sub     x5, x0, #1
    ld1     {v0.8b}, [x0], x2           // &pu1_row_1[0]
    ld1     {v1.8b}, [x5]               // &pu1_row_1[-1]
    add     x5, x5, #2
    ld1     {v2.8b}, [x5]               // &pu1_row_1[1]

    // Number of row pairs to examine
    mov     x4, #4

    movi    v30.2s, #5                  // EDGE_BIAS_0
    movi    v31.2s, #7                  // EDGE_BIAS_1

detect_edge:
    // Load row_2: centre, left and right neighbours
    sub     x5, x0, #1
    ld1     {v3.8b}, [x0], x2           // &pu1_row_2[0]
    ld1     {v4.8b}, [x5]               // &pu1_row_2[-1]
    add     x5, x5, #2
    ld1     {v5.8b}, [x5]               // &pu1_row_2[1]

    // Accumulate absolute differences for the three candidate directions
    uabal   v16.8h, v0.8b, v3.8b        // |row_1[i]     - row_2[i]    |
    uabal   v18.8h, v1.8b, v5.8b        // |row_1[i - 1] - row_2[i + 1]|
    uabal   v20.8h, v2.8b, v4.8b        // |row_1[i + 1] - row_2[i - 1]|

    // This row_2 becomes row_1 of the next pair
    mov     v0.8b, v3.8b
    mov     v1.8b, v4.8b
    mov     v2.8b, v5.8b

    subs    x4, x4, #1
    b.gt    detect_edge

    // Calculate sum of absolute differences for each edge.
    // After addp + uaddlp, lane 0 holds the total for columns 0-3 and
    // lane 1 the total for columns 4-7.
    addp    v16.8h, v16.8h, v16.8h
    addp    v18.8h, v18.8h, v18.8h
    addp    v20.8h, v20.8h, v20.8h
    uaddlp  v16.2s, v16.4h
    uaddlp  v18.2s, v18.4h
    uaddlp  v20.2s, v20.4h

    mul     v16.2s, v16.2s, v30.2s      // adiff[0] *= EDGE_BIAS_0
    mul     v18.2s, v18.2s, v31.2s      // adiff[1] *= EDGE_BIAS_1
    mul     v20.2s, v20.2s, v31.2s      // adiff[2] *= EDGE_BIAS_1

    // Shift selection, identical for both halves:
    //   if (adiff[2] <= adiff[1]) shift = (adiff[2] <= adiff[0]) ? +1 : 0
    //   else                      shift = (adiff[1] <= adiff[0]) ? -1 : 0

compute_shift_1:
    // First half of the block (columns 0-3, lane 0) -> x8
    smov    x5, v16.s[0]                // adiff[0]
    smov    x6, v18.s[0]                // adiff[1]
    smov    x7, v20.s[0]                // adiff[2]
    cmp     x7, x5
    cset    x8, le                      // +1 if adiff[2] <= adiff[0], else 0
    cmp     x6, x5
    csetm   x11, le                     // -1 if adiff[1] <= adiff[0], else 0
    cmp     x7, x6
    csel    x8, x8, x11, le             // pick candidate by adiff[2] <= adiff[1]

compute_shift_2:
    // Second half of the block (columns 4-7, lane 1) -> x9
    smov    x5, v16.s[1]                // adiff[0]
    smov    x6, v18.s[1]                // adiff[1]
    smov    x7, v20.s[1]                // adiff[2]
    cmp     x7, x5
    cset    x9, le                      // +1 if adiff[2] <= adiff[0], else 0
    cmp     x6, x5
    csetm   x11, le                     // -1 if adiff[1] <= adiff[0], else 0
    cmp     x7, x6
    csel    x9, x9, x11, le             // pick candidate by adiff[2] <= adiff[1]

interpolate:
    // First half: x4 -> row n at [+shift], x5 -> row n+1 at [-shift]
    add     x4, x10, x8
    add     x5, x10, x2
    sub     x5, x5, x8
    // Second half (4 bytes further on): x6 / x7 likewise, using its own shift
    add     x10, x10, #4
    add     x6, x10, x9
    add     x7, x10, x2
    sub     x7, x7, x9

    // Number of output rows
    mov     x8, #4

filter_loop:
    // Gather 4 + 4 pixels from the upper row (v0) and the lower row (v2)
    ld1     {v0.s}[0], [x4], x2
    ld1     {v2.s}[0], [x5], x2
    ld1     {v0.s}[1], [x6], x2
    ld1     {v2.s}[1], [x7], x2
    // out = (upper + lower + 1) >> 1
    urhadd  v4.8b, v0.8b, v2.8b
    st1     {v4.2s}, [x1], x3
    subs    x8, x8, #1
    b.gt    filter_loop

    ret