diff --git a/source/Lib/CommonLib/x86/AdaptiveLoopFilterX86.h b/source/Lib/CommonLib/x86/AdaptiveLoopFilterX86.h index 0aecb82edeb355a0f122496ed225e2a915b952f5..7676d0d0178307678cadf898925ca5a5d28c3a87 100644 --- a/source/Lib/CommonLib/x86/AdaptiveLoopFilterX86.h +++ b/source/Lib/CommonLib/x86/AdaptiveLoopFilterX86.h @@ -145,19 +145,31 @@ static void simdDeriveClassificationBlk(AlfClassifier **classifier, int **laplac x6 = _mm_loadu_si128((__m128i *) &colSums[i + 4][j + 4]); x7 = (z2 == vbPos - 4) ? _mm_setzero_si128() : _mm_loadu_si128((__m128i *) &colSums[i + 5][j + 4]); - x0 = _mm_add_epi16(x0, x1); - x2 = _mm_add_epi16(x2, x3); - x4 = _mm_add_epi16(x4, x5); - x6 = _mm_add_epi16(x6, x7); - __m128i x0l = _mm_cvtepu16_epi32(x0); __m128i x0h = _mm_unpackhi_epi16(x0, _mm_setzero_si128()); + __m128i x1l = _mm_cvtepu16_epi32(x1); + __m128i x1h = _mm_unpackhi_epi16(x1, _mm_setzero_si128()); __m128i x2l = _mm_cvtepu16_epi32(x2); __m128i x2h = _mm_unpackhi_epi16(x2, _mm_setzero_si128()); + __m128i x3l = _mm_cvtepu16_epi32(x3); + __m128i x3h = _mm_unpackhi_epi16(x3, _mm_setzero_si128()); __m128i x4l = _mm_cvtepu16_epi32(x4); __m128i x4h = _mm_unpackhi_epi16(x4, _mm_setzero_si128()); + __m128i x5l = _mm_cvtepu16_epi32(x5); + __m128i x5h = _mm_unpackhi_epi16(x5, _mm_setzero_si128()); __m128i x6l = _mm_cvtepu16_epi32(x6); __m128i x6h = _mm_unpackhi_epi16(x6, _mm_setzero_si128()); + __m128i x7l = _mm_cvtepu16_epi32(x7); + __m128i x7h = _mm_unpackhi_epi16(x7, _mm_setzero_si128()); + + x0l = _mm_add_epi32(x0l, x1l); + x2l = _mm_add_epi32(x2l, x3l); + x4l = _mm_add_epi32(x4l, x5l); + x6l = _mm_add_epi32(x6l, x7l); + x0h = _mm_add_epi32(x0h, x1h); + x2h = _mm_add_epi32(x2h, x3h); + x4h = _mm_add_epi32(x4h, x5h); + x6h = _mm_add_epi32(x6h, x7h); x0l = _mm_add_epi32(x0l, x2l); x4l = _mm_add_epi32(x4l, x6l);