From 5302bc0681ef279f5ec668b7b18d26adf880c76d Mon Sep 17 00:00:00 2001 From: Frank Bossen <fbossen@gmail.com> Date: Thu, 3 Oct 2019 16:53:04 +0200 Subject: [PATCH] Silence valgrind warnings Reduce the amount of data being loaded to the strict minimum --- source/Lib/CommonLib/x86/BufferX86.h | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/source/Lib/CommonLib/x86/BufferX86.h b/source/Lib/CommonLib/x86/BufferX86.h index 4a64ef834..7d8771174 100644 --- a/source/Lib/CommonLib/x86/BufferX86.h +++ b/source/Lib/CommonLib/x86/BufferX86.h @@ -277,12 +277,20 @@ void calcBIOSums_SSE(const Pel* srcY0Tmp, const Pel* srcY1Tmp, Pel* gradX0, Pel* for (int y = 0; y < 6; y++) { - __m128i shiftSrcY0Tmp = _mm_srai_epi16(_mm_loadu_si128((__m128i*)(srcY0Tmp)), shift4); - __m128i shiftSrcY1Tmp = _mm_srai_epi16(_mm_loadu_si128((__m128i*)(srcY1Tmp)), shift4); - __m128i loadGradX0 = _mm_loadu_si128((__m128i*)(gradX0)); - __m128i loadGradX1 = _mm_loadu_si128((__m128i*)(gradX1)); - __m128i loadGradY0 = _mm_loadu_si128((__m128i*)(gradY0)); - __m128i loadGradY1 = _mm_loadu_si128((__m128i*)(gradY1)); + // Note: loading 8 values also works, but valgrind doesn't like it + auto load6values = [](const Pel *ptr) { + __m128i a = _mm_loadl_epi64((const __m128i *) ptr); + __m128i b = _mm_cvtsi32_si128(*(uint32_t *) (ptr + 4)); + return _mm_unpacklo_epi64(a, b); + }; + + __m128i shiftSrcY0Tmp = _mm_srai_epi16(load6values(srcY0Tmp), shift4); + __m128i shiftSrcY1Tmp = _mm_srai_epi16(load6values(srcY1Tmp), shift4); + __m128i loadGradX0 = load6values(gradX0); + __m128i loadGradX1 = load6values(gradX1); + __m128i loadGradY0 = load6values(gradY0); + __m128i loadGradY1 = load6values(gradY1); + __m128i subTemp1 = _mm_sub_epi16(shiftSrcY1Tmp, shiftSrcY0Tmp); __m128i packTempX = _mm_srai_epi16(_mm_add_epi16(loadGradX0, loadGradX1), shift5); __m128i packTempY = _mm_srai_epi16(_mm_add_epi16(loadGradY0, loadGradY1), shift5); -- GitLab