// common/armv8/ideint_spatial_filter_av8.s - platform/external/libmpeg2 - Git at Google

 //******************************************************************************
 //*
 //* Copyright (C) 2015 The Android Open Source Project
 //*
 //* Licensed under the Apache License, Version 2.0 (the "License");
 //* you may not use this file except in compliance with the License.
 //* You may obtain a copy of the License at:
 //*
 //* http://www.apache.org/licenses/LICENSE-2.0
 //*
 //* Unless required by applicable law or agreed to in writing, software
 //* distributed under the License is distributed on an "AS IS" BASIS,
 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 //* See the License for the specific language governing permissions and
 //* limitations under the License.
 //*
 //*****************************************************************************
 //* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 //*/

 //******************************************************************************
 //*
 //* @brief
 //*  This file contains definitions of routines for spatial filter
 //*
 //* @author
 //*  Ittiam
 //*
 //* @par List of Functions:
 //*  - ideint_spatial_filter_av8()
 //*
 //* @remarks
 //*  None
 //*
 //*******************************************************************************


 //******************************************************************************
 //*
 //*  @brief Performs spatial filtering
 //*
 //*  @par   Description
 //*   This function performs edge adaptive spatial filtering on an 8 pixel
 //*   wide block.
 //*
 //*   Five source rows (src_strd bytes apart) are examined. For every pair of
 //*   consecutive rows the absolute differences are accumulated along three
 //*   directions:
 //*     adiff[0] : row_1[i]     - row_2[i]
 //*     adiff[1] : row_1[i - 1] - row_2[i + 1]
 //*     adiff[2] : row_1[i + 1] - row_2[i - 1]
 //*   The sums are formed separately for columns 0-3 and columns 4-7.
 //*   adiff[0] is multiplied by 5 and adiff[1], adiff[2] by 7. For each half:
 //*     if adiff[2] <= adiff[1] : shift = (adiff[2] <= adiff[0]) ?  1 : 0
 //*     else                    : shift = (adiff[1] <= adiff[0]) ? -1 : 0
 //*   Four output rows of 8 bytes are then written; output row k is the
 //*   rounded average of source row k read at (column + shift) and source
 //*   row (k + 1) read at (column - shift).
 //*
 //* @param[in] pu1_src
 //*  UWORD8 pointer to the source (x0). For every examined row the bytes
 //*  pu1_src[-1] .. pu1_src[8] are read.
 //*
 //* @param[out] pu1_out
 //*  UWORD8 pointer to the destination (x1)
 //*
 //* @param[in] src_strd
 //*  source stride (w2)
 //*
 //* @param[in] out_strd
 //*  destination stride (w3)
 //*
 //* @returns
 //*     None
 //*
 //* @remarks
 //*  Leaf routine: makes no calls and does not touch the stack.
 //*  Registers written: x0-x10, v0-v5, v16, v18, v20, v30, v31 and the
 //*  condition flags.
 //*
 //******************************************************************************

     .global ideint_spatial_filter_av8

 ideint_spatial_filter_av8:

     // NOTE(review): the strides are presumably declared as 32-bit signed
     // integers in the C prototype (not visible in this file) - confirm.
     // For such arguments the upper halves of x2/x3 are not guaranteed by the
     // procedure call standard, so widen them before they are used as 64-bit
     // post-index offsets.
     sxtw    x2,         w2
     sxtw    x3,         w3

     // Per-column accumulators: v16 -> adiff[0], v18 -> adiff[1],
     // v20 -> adiff[2]
     movi    v16.8h,     #0
     movi    v18.8h,     #0
     movi    v20.8h,     #0

     // Keep the start of the block; x0 is advanced by the row loads
     mov     x10,        x0

     // Load from &pu1_row_1[0] and step x0 to the next row
     sub     x5,         x0,         #1
     ld1     {v0.8b},    [x0],       x2

     // Load from &pu1_row_1[-1]
     ld1     {v1.8b},    [x5]
     add     x5,         x5,         #2

     // Load from &pu1_row_1[1]
     ld1     {v2.8b},    [x5]

     // Number of row pairs to compare
     mov     x4,         #4

     // EDGE_BIAS_0
     movi    v30.2s,     #5

     // EDGE_BIAS_1
     movi    v31.2s,     #7

 detect_edge:
     // Load from &pu1_row_2[0] and step x0 to the next row
     sub     x5,         x0,         #1
     ld1     {v3.8b},    [x0],       x2

     // Load from &pu1_row_2[-1]
     ld1     {v4.8b},    [x5]
     add     x5,         x5,         #2

     // Load from &pu1_row_2[1]
     ld1     {v5.8b},    [x5]

     // Accumulate absolute differences
     // pu1_row_1[i] - pu1_row_2[i]
     uabal   v16.8h,     v0.8b,      v3.8b

     // pu1_row_1[i - 1] - pu1_row_2[i + 1]
     uabal   v18.8h,     v1.8b,      v5.8b

     // pu1_row_1[i + 1] - pu1_row_2[i - 1]
     uabal   v20.8h,     v2.8b,      v4.8b

     // The current row_2 becomes row_1 of the next iteration
     mov     v0.8b,      v3.8b
     mov     v1.8b,      v4.8b
     mov     v2.8b,      v5.8b

     subs    x4,         x4,         #1
     b.gt    detect_edge

     // Calculate sum of absolute differences for each edge direction.
     // After the two pairwise adds, lane 0 of the .2s view holds the sum of
     // columns 0-3 and lane 1 holds the sum of columns 4-7.
     addp    v16.8h,     v16.8h,     v16.8h
     addp    v18.8h,     v18.8h,     v18.8h
     addp    v20.8h,     v20.8h,     v20.8h

     uaddlp  v16.2s,     v16.4h
     uaddlp  v18.2s,     v18.4h
     uaddlp  v20.2s,     v20.4h

     // adiff[0] *= EDGE_BIAS_0;
     mul     v16.2s,     v16.2s,     v30.2s

     // adiff[1] *= EDGE_BIAS_1;
     mul     v18.2s,     v18.2s,     v31.2s

     // adiff[2] *= EDGE_BIAS_1;
     mul     v20.2s,     v20.2s,     v31.2s

     // Compute shift (x8) for the first half of the block (columns 0-3)
 compute_shift_1:
     // Move the differences to general purpose registers
     smov    x5,         v16.s[0]
     smov    x6,         v18.s[0]
     smov    x7,         v20.s[0]

     // adiff[2] <= adiff[1] ?
     cmp     x7,         x6
     b.gt    dir_45_gt_135_1

     // shift = (adiff[2] <= adiff[0]) ? 1 : 0
     cmp     x7,         x5
     cset    x8,         le

     b       compute_shift_2

 dir_45_gt_135_1:
     // shift = (adiff[1] <= adiff[0]) ? -1 : 0
     cmp     x6,         x5
     csetm   x8,         le

     // Compute shift (x9) for the second half of the block (columns 4-7)
 compute_shift_2:
     smov    x5,         v16.s[1]
     smov    x6,         v18.s[1]
     smov    x7,         v20.s[1]

     // adiff[2] <= adiff[1] ?
     cmp     x7,         x6
     b.gt    dir_45_gt_135_2

     // shift = (adiff[2] <= adiff[0]) ? 1 : 0
     cmp     x7,         x5
     cset    x9,         le

     b       interpolate

 dir_45_gt_135_2:
     // shift = (adiff[1] <= adiff[0]) ? -1 : 0
     cmp     x6,         x5
     csetm   x9,         le

 interpolate:
     // First half: x4 -> row k at (column + shift),
     //             x5 -> row k + 1 at (column - shift)
     add     x4,         x10,        x8
     add     x5,         x10,        x2
     sub     x5,         x5,         x8

     // Second half starts 4 columns further: x6 / x7 likewise with x9
     add     x10,        x10,        #4
     add     x6,         x10,        x9
     add     x7,         x10,        x2
     sub     x7,         x7,         x9

     // Number of output rows
     mov     x8,         #4

 filter_loop:
     // Lane 0 carries the first half, lane 1 the second half
     ld1     {v0.s}[0],  [x4],       x2
     ld1     {v2.s}[0],  [x5],       x2

     ld1     {v0.s}[1],  [x6],       x2
     ld1     {v2.s}[1],  [x7],       x2

     // Rounded average of the two rows, then store one 8 byte output row
     urhadd  v4.8b,      v0.8b,      v2.8b
     st1     {v4.2s},    [x1],       x3

     subs    x8,         x8,         #1
     b.gt    filter_loop

     ret
	//******************************************************************************
	//*
	//* Copyright (C) 2015 The Android Open Source Project
	//*
	//* Licensed under the Apache License, Version 2.0 (the "License");
	//* you may not use this file except in compliance with the License.
	//* You may obtain a copy of the License at:
	//*
	//* http://www.apache.org/licenses/LICENSE-2.0
	//*
	//* Unless required by applicable law or agreed to in writing, software
	//* distributed under the License is distributed on an "AS IS" BASIS,
	//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	//* See the License for the specific language governing permissions and
	//* limitations under the License.
	//*
	//*****************************************************************************
	//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
	//*/

	//******************************************************************************
	//*
	//* @brief
	//* This file contains definitions of routines for spatial filter
	//*
	//* @author
	//* Ittiam
	//*
	//* @par List of Functions:
	//* - ideint_spatial_filter_av8()
	//*
	//* @remarks
	//* None
	//*
	//*******************************************************************************


	//******************************************************************************
	//*
	//* @brief Performs spatial filtering
	//*
	//* @par Description
	//* This function performs edge adaptive spatial filtering on an 8 pixel
	//* wide block.
	//*
	//* Five source rows (src_strd bytes apart) are examined. For every pair of
	//* consecutive rows the absolute differences are accumulated along three
	//* directions:
	//*   adiff[0] : row_1[i]     - row_2[i]
	//*   adiff[1] : row_1[i - 1] - row_2[i + 1]
	//*   adiff[2] : row_1[i + 1] - row_2[i - 1]
	//* The sums are formed separately for columns 0-3 and columns 4-7.
	//* adiff[0] is multiplied by 5 and adiff[1], adiff[2] by 7. For each half:
	//*   if adiff[2] <= adiff[1] : shift = (adiff[2] <= adiff[0]) ?  1 : 0
	//*   else                    : shift = (adiff[1] <= adiff[0]) ? -1 : 0
	//* Four output rows of 8 bytes are then written; output row k is the
	//* rounded average of source row k read at (column + shift) and source
	//* row (k + 1) read at (column - shift).
	//*
	//* @param[in] pu1_src
	//* UWORD8 pointer to the source (x0). For every examined row the bytes
	//* pu1_src[-1] .. pu1_src[8] are read.
	//*
	//* @param[out] pu1_out
	//* UWORD8 pointer to the destination (x1)
	//*
	//* @param[in] src_strd
	//* source stride (w2)
	//*
	//* @param[in] out_strd
	//* destination stride (w3)
	//*
	//* @returns
	//* None
	//*
	//* @remarks
	//* Leaf routine: makes no calls and does not touch the stack.
	//* Registers written: x0-x10, v0-v5, v16, v18, v20, v30, v31 and the
	//* condition flags.
	//*
	//******************************************************************************

	.global ideint_spatial_filter_av8

	ideint_spatial_filter_av8:

		// NOTE(review): the strides are presumably declared as 32-bit signed
		// integers in the C prototype (not visible in this file) - confirm.
		// For such arguments the upper halves of x2/x3 are not guaranteed by
		// the procedure call standard, so widen them before they are used as
		// 64-bit post-index offsets.
		sxtw    x2, w2
		sxtw    x3, w3

		// Per-column accumulators: v16 -> adiff[0], v18 -> adiff[1],
		// v20 -> adiff[2]
		movi    v16.8h, #0
		movi    v18.8h, #0
		movi    v20.8h, #0

		// Keep the start of the block; x0 is advanced by the row loads
		mov     x10, x0

		// Load from &pu1_row_1[0] and step x0 to the next row
		sub     x5, x0, #1
		ld1     {v0.8b}, [x0], x2

		// Load from &pu1_row_1[-1]
		ld1     {v1.8b}, [x5]
		add     x5, x5, #2

		// Load from &pu1_row_1[1]
		ld1     {v2.8b}, [x5]

		// Number of row pairs to compare
		mov     x4, #4

		// EDGE_BIAS_0
		movi    v30.2s, #5

		// EDGE_BIAS_1
		movi    v31.2s, #7

	detect_edge:
		// Load from &pu1_row_2[0] and step x0 to the next row
		sub     x5, x0, #1
		ld1     {v3.8b}, [x0], x2

		// Load from &pu1_row_2[-1]
		ld1     {v4.8b}, [x5]
		add     x5, x5, #2

		// Load from &pu1_row_2[1]
		ld1     {v5.8b}, [x5]

		// Accumulate absolute differences
		// pu1_row_1[i] - pu1_row_2[i]
		uabal   v16.8h, v0.8b, v3.8b

		// pu1_row_1[i - 1] - pu1_row_2[i + 1]
		uabal   v18.8h, v1.8b, v5.8b

		// pu1_row_1[i + 1] - pu1_row_2[i - 1]
		uabal   v20.8h, v2.8b, v4.8b

		// The current row_2 becomes row_1 of the next iteration
		mov     v0.8b, v3.8b
		mov     v1.8b, v4.8b
		mov     v2.8b, v5.8b

		subs    x4, x4, #1
		b.gt    detect_edge

		// Calculate sum of absolute differences for each edge direction.
		// After the two pairwise adds, lane 0 of the .2s view holds the sum
		// of columns 0-3 and lane 1 holds the sum of columns 4-7.
		addp    v16.8h, v16.8h, v16.8h
		addp    v18.8h, v18.8h, v18.8h
		addp    v20.8h, v20.8h, v20.8h

		uaddlp  v16.2s, v16.4h
		uaddlp  v18.2s, v18.4h
		uaddlp  v20.2s, v20.4h

		// adiff[0] *= EDGE_BIAS_0;
		mul     v16.2s, v16.2s, v30.2s

		// adiff[1] *= EDGE_BIAS_1;
		mul     v18.2s, v18.2s, v31.2s

		// adiff[2] *= EDGE_BIAS_1;
		mul     v20.2s, v20.2s, v31.2s

		// Compute shift (x8) for the first half of the block (columns 0-3)
	compute_shift_1:
		// Move the differences to general purpose registers
		smov    x5, v16.s[0]
		smov    x6, v18.s[0]
		smov    x7, v20.s[0]

		// adiff[2] <= adiff[1] ?
		cmp     x7, x6
		b.gt    dir_45_gt_135_1

		// shift = (adiff[2] <= adiff[0]) ? 1 : 0
		cmp     x7, x5
		cset    x8, le

		b       compute_shift_2

	dir_45_gt_135_1:
		// shift = (adiff[1] <= adiff[0]) ? -1 : 0
		cmp     x6, x5
		csetm   x8, le

		// Compute shift (x9) for the second half of the block (columns 4-7)
	compute_shift_2:
		smov    x5, v16.s[1]
		smov    x6, v18.s[1]
		smov    x7, v20.s[1]

		// adiff[2] <= adiff[1] ?
		cmp     x7, x6
		b.gt    dir_45_gt_135_2

		// shift = (adiff[2] <= adiff[0]) ? 1 : 0
		cmp     x7, x5
		cset    x9, le

		b       interpolate

	dir_45_gt_135_2:
		// shift = (adiff[1] <= adiff[0]) ? -1 : 0
		cmp     x6, x5
		csetm   x9, le

	interpolate:
		// First half: x4 -> row k at (column + shift),
		//             x5 -> row k + 1 at (column - shift)
		add     x4, x10, x8
		add     x5, x10, x2
		sub     x5, x5, x8

		// Second half starts 4 columns further: x6 / x7 likewise with x9
		add     x10, x10, #4
		add     x6, x10, x9
		add     x7, x10, x2
		sub     x7, x7, x9

		// Number of output rows
		mov     x8, #4

	filter_loop:
		// Lane 0 carries the first half, lane 1 the second half
		ld1     {v0.s}[0], [x4], x2
		ld1     {v2.s}[0], [x5], x2

		ld1     {v0.s}[1], [x6], x2
		ld1     {v2.s}[1], [x7], x2

		// Rounded average of the two rows, then store one 8 byte output row
		urhadd  v4.8b, v0.8b, v2.8b
		st1     {v4.2s}, [x1], x3

		subs    x8, x8, #1
		b.gt    filter_loop

		ret