use stream loading technology to optimize memory copy
BZ: 181818
Use SSE4.1 streaming loads to optimize memory copy in the video editor.
Change-Id: Idfeca8e985039a0145832d429fb8d3dce7b7a008
Signed-off-by: Sun, Jian <jianx.sun@intel.com>
diff --git a/videodecoder/VideoDecoderBase.cpp b/videodecoder/VideoDecoderBase.cpp
index bf2a46e..7d1cd13 100644
--- a/videodecoder/VideoDecoderBase.cpp
+++ b/videodecoder/VideoDecoderBase.cpp
@@ -27,6 +27,9 @@
#include <string.h>
#include <va/va_android.h>
#include <va/va_tpi.h>
+#ifdef __SSE4_1__
+#include "use_util_sse4.h"
+#endif
#define INVALID_PTS ((uint64_t)-1)
#define MAXIMUM_POC 0x7FFFFFFF
@@ -1165,21 +1168,33 @@
}
if (size == (int32_t)vaImage.data_size) {
+#ifdef __SSE4_1__
+ stream_memcpy(pRawData, pBuf, size);
+#else
memcpy(pRawData, pBuf, size);
+#endif
} else {
// copy Y data
uint8_t *src = (uint8_t*)pBuf;
uint8_t *dst = pRawData;
int32_t row = 0;
for (row = 0; row < cropHeight; row++) {
+#ifdef __SSE4_1__
+ stream_memcpy(dst, src, cropWidth);
+#else
memcpy(dst, src, cropWidth);
+#endif
dst += cropWidth;
src += vaImage.pitches[0];
}
// copy interleaved V and U data
src = (uint8_t*)pBuf + vaImage.offsets[1];
for (row = 0; row < cropHeight / 2; row++) {
+#ifdef __SSE4_1__
+ stream_memcpy(dst, src, cropWidth);
+#else
memcpy(dst, src, cropWidth);
+#endif
dst += cropWidth;
src += vaImage.pitches[1];
}
diff --git a/videodecoder/use_util_sse4.h b/videodecoder/use_util_sse4.h
new file mode 100644
index 0000000..073ae22
--- /dev/null
+++ b/videodecoder/use_util_sse4.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ * Li Zeng <li.zeng@intel.com>
+ * Jian Sun <jianx.sun@intel.com>
+ */
+
+#include <emmintrin.h>
+#include <x86intrin.h>
+
+/*
+ * Copy size bytes from src_buff to dst_buff with SSE4.1 streaming loads.
+ *
+ * The streaming path requires both pointers to be 16-byte aligned; any other
+ * pair is copied with memcpy(). As with memcpy(), buffers must not overlap.
+ *
+ * dst_buff: destination, src_buff: source, size: number of bytes to copy.
+ */
+inline void stream_memcpy(void* dst_buff, const void* src_buff, size_t size)
+{
+    /* _mm_stream_load_si128/_mm_store_si128 require 16-byte aligned addresses. */
+    if ((((size_t)src_buff | (size_t)dst_buff) & 0xF) != 0) {
+        memcpy(dst_buff, src_buff, size);
+        return; /* must not continue into the aligned-only path below */
+    }
+
+    static const size_t regs_count = 8; /* 8 x 16 bytes = 128 bytes per pass */
+    __m128i xmm_data0, xmm_data1, xmm_data2, xmm_data3;
+    __m128i xmm_data4, xmm_data5, xmm_data6, xmm_data7;
+
+    /* Bytes left once all whole 128-byte passes are done (size mod 128). */
+    size_t remain_data = size & (regs_count * sizeof(xmm_data0) - 1);
+    size_t end_position = 0;
+
+    __m128i* pWb_buff = (__m128i*)dst_buff;
+    __m128i* pWb_buff_end = pWb_buff + ((size - remain_data) >> 4);
+    __m128i* pWc_buff = (__m128i*)src_buff;
+
+    /* Full fence before the streaming loads (source is presumably WC memory). */
+    _mm_mfence();
+
+    /* Main loop: eight loads issued back to back, then eight stores. */
+    while (pWb_buff < pWb_buff_end) {
+        xmm_data0 = _mm_stream_load_si128(pWc_buff);
+        xmm_data1 = _mm_stream_load_si128(pWc_buff + 1);
+        xmm_data2 = _mm_stream_load_si128(pWc_buff + 2);
+        xmm_data3 = _mm_stream_load_si128(pWc_buff + 3);
+        xmm_data4 = _mm_stream_load_si128(pWc_buff + 4);
+        xmm_data5 = _mm_stream_load_si128(pWc_buff + 5);
+        xmm_data6 = _mm_stream_load_si128(pWc_buff + 6);
+        xmm_data7 = _mm_stream_load_si128(pWc_buff + 7);
+        pWc_buff += regs_count;
+
+        _mm_store_si128(pWb_buff, xmm_data0);
+        _mm_store_si128(pWb_buff + 1, xmm_data1);
+        _mm_store_si128(pWb_buff + 2, xmm_data2);
+        _mm_store_si128(pWb_buff + 3, xmm_data3);
+        _mm_store_si128(pWb_buff + 4, xmm_data4);
+        _mm_store_si128(pWb_buff + 5, xmm_data5);
+        _mm_store_si128(pWb_buff + 6, xmm_data6);
+        _mm_store_si128(pWb_buff + 7, xmm_data7);
+        pWb_buff += regs_count;
+    }
+
+    /* Copy the remaining whole 16-byte chunks (at most seven). */
+    if (remain_data >= 16) {
+        end_position = remain_data >> 4;
+        remain_data &= 15;
+        for (size_t i = 0; i < end_position; ++i) {
+            _mm_store_si128(pWb_buff + i, _mm_stream_load_si128(pWc_buff + i));
+        }
+    }
+
+    /*
+     * Copy the last 1..15 bytes. The load still fetches a whole aligned 16-byte
+     * chunk, so up to 15 bytes past src_buff + size are read (never written).
+     * NOTE(review): relies on that chunk being readable - confirm for callers.
+     */
+    if (remain_data) {
+        __m128i temp_data = _mm_stream_load_si128(pWc_buff + end_position);
+        memcpy(pWb_buff + end_position, &temp_data, remain_data);
+    }
+}