// blob: ea3fd0be7493e7069bc4eb87db65fea7e649a009 [file] [log] [blame]
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef CAFFE2_UTILS_CPU_NEON_H_
#define CAFFE2_UTILS_CPU_NEON_H_
// Provides a variety of ARM NEON-specific utility functions
#ifdef __ARM_NEON__
#include <arm_neon.h>
namespace caffe2 {
// Reports whether the address stored in `p` is an exact multiple of `align`
// bytes. The check is a plain modulo, so `align` must be non-zero; it does
// not have to be a power of two.
template <typename T>
inline bool isPointerAligned(T* p, size_t align) {
  const uintptr_t address = reinterpret_cast<uintptr_t>(p);
  const auto remainder = address % align;
  return remainder == 0;
}
// Lane-wise ("vertical") sum of four float vectors: lane i of the result is
// (v0[i] + v1[i]) + (v2[i] + v3[i]). The two pairs are added first and then
// combined, so the floating-point association is exactly as written here.
inline float32x4_t vert_sum_f32(float32x4_t v0,
                                float32x4_t v1,
                                float32x4_t v2,
                                float32x4_t v3) {
  const float32x4_t first_pair = vaddq_f32(v0, v1);
  const float32x4_t second_pair = vaddq_f32(v2, v3);
  return vaddq_f32(first_pair, second_pair);
}
// Reduces four float vectors (16 values in total) to a single scalar.
// Step 1: lane-wise sum of the four inputs via vert_sum_f32.
// Step 2: add the high half of that vector to its low half (4 -> 2 lanes).
// Step 3: pairwise-add the two remaining lanes and return lane 0.
inline float horizontal_sum_f32(float32x4_t v0,
                                float32x4_t v1,
                                float32x4_t v2,
                                float32x4_t v3) {
  const float32x4_t lane_totals = vert_sum_f32(v0, v1, v2, v3);
  const float32x2_t upper = vget_high_f32(lane_totals);
  const float32x2_t lower = vget_low_f32(lane_totals);
  const float32x2_t halves = vadd_f32(upper, lower);
  const float32x2_t folded = vpadd_f32(halves, halves);
  return vget_lane_f32(folded, 0);
}
// Load/store functions that assume alignment
// Loads a float32x4_t from `p` with vld1q_f32. The pointer is first passed
// through __builtin_assume_aligned, which lets the compiler assume it is
// aligned to sizeof(float32x4_t) bytes; the caller is responsible for that
// actually being the case.
inline float32x4_t vld1q_f32_aligned(const float* p) {
  const void* hinted = __builtin_assume_aligned(p, sizeof(float32x4_t));
  return vld1q_f32(static_cast<const float*>(hinted));
}
// Stores the float32x4_t `v` to `p` with vst1q_f32. The destination is first
// passed through __builtin_assume_aligned, which lets the compiler assume it
// is aligned to sizeof(float32x4_t) bytes; the caller must guarantee that.
inline void vst1q_f32_aligned(float* p, float32x4_t v) {
  void* hinted = __builtin_assume_aligned(p, sizeof(float32x4_t));
  vst1q_f32(static_cast<float*>(hinted), v);
}
// Stores the four uint8x8_t vectors held in `v` to `p` with vst4_u8. The
// destination is first passed through __builtin_assume_aligned, which lets
// the compiler assume it is aligned to sizeof(uint8x8x4_t) bytes (the size of
// the whole four-vector struct); the caller must guarantee that.
inline void vst4_u8_aligned(uint8_t* p, uint8x8x4_t v) {
  void* hinted = __builtin_assume_aligned(p, sizeof(uint8x8x4_t));
  vst4_u8(static_cast<uint8_t*>(hinted), v);
}
} // namespace caffe2
#endif // __ARM_NEON__
#endif // CAFFE2_UTILS_CPU_NEON_H_