Source/platform/graphics/cpu/arm/filters/FECompositeArithmeticNEON.h - platform/external/chromium_org/third_party/WebKit - Git at Google

 /*
  * Copyright (C) 2011 University of Szeged
  * Copyright (C) 2011 Felician Marton
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF SZEGED ``AS IS'' AND ANY
  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL UNIVERSITY OF SZEGED OR
  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */

 #ifndef FECompositeArithmeticNEON_h
 #define FECompositeArithmeticNEON_h

 #if HAVE(ARM_NEON_INTRINSICS)

 #include "platform/graphics/filters/FEComposite.h"
 #include <arm_neon.h>

 namespace WebCore {

 template <int b1, int b4>
 inline void FEComposite::computeArithmeticPixelsNeon(unsigned char* source, unsigned char* destination,
     unsigned pixelArrayLength, float k1, float k2, float k3, float k4)
 {
     float32x4_t k1x4 = vdupq_n_f32(k1 / 255);
     float32x4_t k2x4 = vdupq_n_f32(k2);
     float32x4_t k3x4 = vdupq_n_f32(k3);
     float32x4_t k4x4 = vdupq_n_f32(k4 * 255);
     uint32x4_t max255 = vdupq_n_u32(255);

     uint32_t* sourcePixel = reinterpret_cast<uint32_t*>(source);
     uint32_t* destinationPixel = reinterpret_cast<uint32_t*>(destination);
     uint32_t* destinationEndPixel = destinationPixel + (pixelArrayLength >> 2);

     while (destinationPixel < destinationEndPixel) {
         uint32x2_t temporary1 = vset_lane_u32(*sourcePixel, temporary1, 0);
         uint16x4_t temporary2 = vget_low_u16(vmovl_u8(vreinterpret_u8_u32(temporary1)));
         float32x4_t sourcePixelAsFloat = vcvtq_f32_u32(vmovl_u16(temporary2));

         temporary1 = vset_lane_u32(*destinationPixel, temporary1, 0);
         temporary2 = vget_low_u16(vmovl_u8(vreinterpret_u8_u32(temporary1)));
         float32x4_t destinationPixelAsFloat = vcvtq_f32_u32(vmovl_u16(temporary2));

         float32x4_t result = vmulq_f32(sourcePixelAsFloat, k2x4);
         result = vmlaq_f32(result, destinationPixelAsFloat, k3x4);
         if (b1)
             result = vmlaq_f32(result, vmulq_f32(sourcePixelAsFloat, destinationPixelAsFloat), k1x4);
         if (b4)
             result = vaddq_f32(result, k4x4);

         // Convert result to uint so negative values are converted to zero.
         uint16x4_t temporary3 = vmovn_u32(vminq_u32(vcvtq_u32_f32(result), max255));
         uint8x8_t temporary4 = vmovn_u16(vcombine_u16(temporary3, temporary3));
         *destinationPixel++ = vget_lane_u32(vreinterpret_u32_u8(temporary4), 0);
         ++sourcePixel;
     }
 }

 inline void FEComposite::platformArithmeticNeon(unsigned char* source, unsigned char* destination,
     unsigned pixelArrayLength, float k1, float k2, float k3, float k4)
 {
     if (!k4) {
         if (!k1) {
             computeArithmeticPixelsNeon<0, 0>(source, destination, pixelArrayLength, k1, k2, k3, k4);
             return;
         }

         computeArithmeticPixelsNeon<1, 0>(source, destination, pixelArrayLength, k1, k2, k3, k4);
         return;
     }

     if (!k1) {
         computeArithmeticPixelsNeon<0, 1>(source, destination, pixelArrayLength, k1, k2, k3, k4);
         return;
     }
     computeArithmeticPixelsNeon<1, 1>(source, destination, pixelArrayLength, k1, k2, k3, k4);
 }

 } // namespace WebCore

 #endif // HAVE(ARM_NEON_INTRINSICS)

 #endif // FECompositeArithmeticNEON_h
	/*
	* Copyright (C) 2011 University of Szeged
	* Copyright (C) 2011 Felician Marton
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	* 1. Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* 2. Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	*
	* THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF SZEGED ``AS IS'' AND ANY
	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL UNIVERSITY OF SZEGED OR
	* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
	* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
	* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
	* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
	* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*/

	#ifndef FECompositeArithmeticNEON_h
	#define FECompositeArithmeticNEON_h

	#if HAVE(ARM_NEON_INTRINSICS)

	#include "platform/graphics/filters/FEComposite.h"
	#include <arm_neon.h>

	namespace WebCore {

	template <int b1, int b4>
	inline void FEComposite::computeArithmeticPixelsNeon(unsigned char* source, unsigned char* destination,
	unsigned pixelArrayLength, float k1, float k2, float k3, float k4)
	{
	float32x4_t k1x4 = vdupq_n_f32(k1 / 255);
	float32x4_t k2x4 = vdupq_n_f32(k2);
	float32x4_t k3x4 = vdupq_n_f32(k3);
	float32x4_t k4x4 = vdupq_n_f32(k4 * 255);
	uint32x4_t max255 = vdupq_n_u32(255);

	uint32_t* sourcePixel = reinterpret_cast<uint32_t*>(source);
	uint32_t* destinationPixel = reinterpret_cast<uint32_t*>(destination);
	uint32_t* destinationEndPixel = destinationPixel + (pixelArrayLength >> 2);

	while (destinationPixel < destinationEndPixel) {
	uint32x2_t temporary1 = vset_lane_u32(*sourcePixel, temporary1, 0);
	uint16x4_t temporary2 = vget_low_u16(vmovl_u8(vreinterpret_u8_u32(temporary1)));
	float32x4_t sourcePixelAsFloat = vcvtq_f32_u32(vmovl_u16(temporary2));

	temporary1 = vset_lane_u32(*destinationPixel, temporary1, 0);
	temporary2 = vget_low_u16(vmovl_u8(vreinterpret_u8_u32(temporary1)));
	float32x4_t destinationPixelAsFloat = vcvtq_f32_u32(vmovl_u16(temporary2));

	float32x4_t result = vmulq_f32(sourcePixelAsFloat, k2x4);
	result = vmlaq_f32(result, destinationPixelAsFloat, k3x4);
	if (b1)
	result = vmlaq_f32(result, vmulq_f32(sourcePixelAsFloat, destinationPixelAsFloat), k1x4);
	if (b4)
	result = vaddq_f32(result, k4x4);

	// Convert result to uint so negative values are converted to zero.
	uint16x4_t temporary3 = vmovn_u32(vminq_u32(vcvtq_u32_f32(result), max255));
	uint8x8_t temporary4 = vmovn_u16(vcombine_u16(temporary3, temporary3));
	*destinationPixel++ = vget_lane_u32(vreinterpret_u32_u8(temporary4), 0);
	++sourcePixel;
	}
	}

	inline void FEComposite::platformArithmeticNeon(unsigned char* source, unsigned char* destination,
	unsigned pixelArrayLength, float k1, float k2, float k3, float k4)
	{
	if (!k4) {
	if (!k1) {
	computeArithmeticPixelsNeon<0, 0>(source, destination, pixelArrayLength, k1, k2, k3, k4);
	return;
	}

	computeArithmeticPixelsNeon<1, 0>(source, destination, pixelArrayLength, k1, k2, k3, k4);
	return;
	}

	if (!k1) {
	computeArithmeticPixelsNeon<0, 1>(source, destination, pixelArrayLength, k1, k2, k3, k4);
	return;
	}
	computeArithmeticPixelsNeon<1, 1>(source, destination, pixelArrayLength, k1, k2, k3, k4);
	}

	} // namespace WebCore

	#endif // HAVE(ARM_NEON_INTRINSICS)

	#endif // FECompositeArithmeticNEON_h