From 38738d6a65f7e172a8a57b7a1b41dc6003afb1c4 Mon Sep 17 00:00:00 2001
From: Vadim Seregin <vseregin@qti.qualcomm.com>
Date: Wed, 20 Oct 2021 20:10:31 -0700
Subject: [PATCH] split AVX2 SSE SIMD

---
 source/Lib/CommonLib/x86/RdCostX86.h | 47 ++++++++++++++++++----------
 1 file changed, 31 insertions(+), 16 deletions(-)

diff --git a/source/Lib/CommonLib/x86/RdCostX86.h b/source/Lib/CommonLib/x86/RdCostX86.h
index 529287f2e..0bbbe46e5 100644
--- a/source/Lib/CommonLib/x86/RdCostX86.h
+++ b/source/Lib/CommonLib/x86/RdCostX86.h
@@ -389,26 +389,41 @@ Distortion RdCost::xGetSSE_NxN_SIMD(const DistParam &rcDtParam)
     if (vext >= AVX2 && ((iCols & 15) == 0))
     {
 #ifdef USE_AVX2
-      __m256i Sum   = _mm256_setzero_si256();
       __m256i vzero = _mm256_setzero_si256();
-      for (int iY = 0; iY < iRows; iY++)
+
+      int num = 1;
+      int rows = iRows;
+
+      if( iRows > 64 )   // tall blocks are processed as 'num' chunks of 64 rows, each reduced on its own
+      {
+        num = iRows / 64;   // NOTE(review): truncating division - rows past a multiple of 64 are skipped; confirm callers only pass multiples of 64
+        rows = 64;
+      }
+
+      for( int i = 0; i < num; i++ )
       {
-        for (int iX = 0; iX < iCols; iX += 16)
+        __m256i Sum = _mm256_setzero_si256();   // per-chunk 32-bit lane accumulators (presumably split to avoid lane overflow - TODO confirm)
+
+        for( int iY = 0; iY < rows; iY++ )
         {
-          __m256i Src1 = (sizeof(Torg) > 1) ? (_mm256_lddqu_si256((__m256i *)(&pSrc1[iX]))) : (_mm256_unpacklo_epi8(_mm256_permute4x64_epi64(_mm256_castsi128_si256(_mm_lddqu_si128((__m128i *) (&pSrc1[iX]))), 0xD8), vzero));
-          __m256i Src2 = (sizeof(Tcur) > 1) ? (_mm256_lddqu_si256((__m256i *)(&pSrc2[iX]))) : (_mm256_unpacklo_epi8(_mm256_permute4x64_epi64(_mm256_castsi128_si256(_mm_lddqu_si128((__m128i *) (&pSrc2[iX]))), 0xD8), vzero));
-          __m256i Diff = _mm256_sub_epi16(Src1, Src2);
-          __m256i Res  = _mm256_madd_epi16(Diff, Diff);
-          Sum          = _mm256_add_epi32(Sum, Res);
+          for( int iX = 0; iX < iCols; iX += 16 )
+          {
+            __m256i Src1 = (sizeof( Torg ) > 1) ? (_mm256_lddqu_si256( (__m256i *)(&pSrc1[iX]) )) : (_mm256_unpacklo_epi8( _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_lddqu_si128( (__m128i *) (&pSrc1[iX]) ) ), 0xD8 ), vzero ));
+            __m256i Src2 = (sizeof( Tcur ) > 1) ? (_mm256_lddqu_si256( (__m256i *)(&pSrc2[iX]) )) : (_mm256_unpacklo_epi8( _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_lddqu_si128( (__m128i *) (&pSrc2[iX]) ) ), 0xD8 ), vzero ));
+            __m256i Diff = _mm256_sub_epi16( Src1, Src2 );
+            __m256i Res  = _mm256_madd_epi16( Diff, Diff );
+            Sum          = _mm256_add_epi32( Sum, Res );
+          }
+          pSrc1 += iStrideSrc1;
+          pSrc2 += iStrideSrc2;
         }
-        pSrc1 += iStrideSrc1;
-        pSrc2 += iStrideSrc2;
-      }
 
-      Sum   = _mm256_add_epi64(_mm256_unpacklo_epi32(Sum, vzero), _mm256_unpackhi_epi32(Sum, vzero));
-      Sum   = _mm256_add_epi64(Sum, _mm256_permute4x64_epi64(Sum, 14));
-      Sum   = _mm256_add_epi64(Sum, _mm256_permute4x64_epi64(Sum, 1));
-      uiRet = _mm_cvtsi128_si64(_mm256_castsi256_si128(Sum)) >> uiShift;
+        Sum = _mm256_add_epi64( _mm256_unpacklo_epi32( Sum, vzero ), _mm256_unpackhi_epi32( Sum, vzero ) );
+        Sum = _mm256_add_epi64( Sum, _mm256_permute4x64_epi64( Sum, 14 ) );
+        Sum = _mm256_add_epi64( Sum, _mm256_permute4x64_epi64( Sum, 1 ) );
+        uiRet += _mm_cvtsi128_si64( _mm256_castsi256_si128( Sum ) );   // add the unshifted 64-bit chunk total; shifting here as well would apply uiShift twice
+      }
+      uiRet >>= uiShift;   // apply uiShift exactly once, to the whole-block sum, as the pre-split code did
 #endif
     }
     else if (iCols > 32 && (iCols & 15) == 0)
@@ -2578,7 +2593,7 @@ void RdCost::_initRdCostX86()
   m_afpDistortFunc[DF_SSE16] = xGetSSE_NxN_SIMD<16, vext>;
   m_afpDistortFunc[DF_SSE32] = xGetSSE_NxN_SIMD<32, vext>;
   m_afpDistortFunc[DF_SSE64] = xGetSSE_NxN_SIMD<64, vext>;
-#if DIST_SSE_ENABLE && CTU_256 && 0
+#if CTU_256
   m_afpDistortFunc[DF_SSE16N] = xGetSSE_NxN_SIMD<vext>;
 #else
   m_afpDistortFunc[DF_SSE16N] = xGetSSE_NxN_SIMD<128, vext>;
-- 
GitLab