diff --git a/source/Lib/CommonLib/x86/InterpolationFilterX86.h b/source/Lib/CommonLib/x86/InterpolationFilterX86.h index 9363438b69b667f1836802e79436159cbdcbae26..0d7421f2abd9850fddcfa308b2b077dc8ed3298d 100644 --- a/source/Lib/CommonLib/x86/InterpolationFilterX86.h +++ b/source/Lib/CommonLib/x86/InterpolationFilterX86.h @@ -1249,17 +1249,19 @@ void xWeightedTriangleBlk_SSE(const PredictionUnit &pu, const uint32_t width, co { for (int y = 0; y < height; y++) { - __m128i s0 = _mm_loadl_epi64((__m128i *) (src0)); - __m128i s1 = _mm_loadl_epi64((__m128i *) (src1)); - __m128i w0 = _mm_loadl_epi64((__m128i *) (weight)); + __m128i s0 = _mm_cvtsi32_si128(*(uint32_t *) src0); + __m128i s1 = _mm_cvtsi32_si128(*(uint32_t *) src1); + __m128i w0 = _mm_cvtsi32_si128(*(uint32_t *) weight); __m128i w1 = _mm_sub_epi16(mmEight, w0); + s0 = _mm_unpacklo_epi16(s0, s1); w0 = _mm_unpacklo_epi16(w0, w1); s0 = _mm_add_epi32(_mm_madd_epi16(s0, w0), mmOffset); s0 = _mm_sra_epi32(s0, mmShift); s0 = _mm_packs_epi32(s0, s0); s0 = _mm_min_epi16(mmMax, _mm_max_epi16(s0, mmMin)); - *(int*)(dst) = _mm_cvtsi128_si32(s0); + + *(uint32_t *) dst = _mm_cvtsi128_si32(s0); dst += strideDst; src0 += strideSrc0; src1 += strideSrc1;