From b7ad1fdad7e127881404e015ef7342889a2893c5 Mon Sep 17 00:00:00 2001
From: Frank Bossen <fbossen@gmail.com>
Date: Mon, 28 Aug 2023 09:59:54 +0200
Subject: [PATCH] Clean up AffineGradientSearch SIMD code

---
 .../CommonLib/x86/AffineGradientSearchX86.h   | 955 +++++++-----------
 1 file changed, 384 insertions(+), 571 deletions(-)

diff --git a/source/Lib/CommonLib/x86/AffineGradientSearchX86.h b/source/Lib/CommonLib/x86/AffineGradientSearchX86.h
index 9aa8524f1..0fe9fd8b7 100644
--- a/source/Lib/CommonLib/x86/AffineGradientSearchX86.h
+++ b/source/Lib/CommonLib/x86/AffineGradientSearchX86.h
@@ -31,11 +31,6 @@
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-/**
- * \file
- * \brief Implementation of AffineGradientSearch class
- */
-//#define USE_AVX2
 // ====================================================================================================================
 // Includes
 // ====================================================================================================================
@@ -43,9 +38,6 @@
 #include "CommonDefX86.h"
 #include "../AffineGradientSearch.h"
 
-//! \ingroup CommonLib
-//! \{
-
 #ifdef TARGET_SIMD_X86
 
 #if defined _MSC_VER
@@ -54,272 +46,369 @@
 #include <immintrin.h>
 #endif
 
-#define CALC_EQUAL_COEFF_8PXLS(x1,x2,y1,y2,tmp0,tmp1,tmp2,tmp3,inter0,inter1,inter2,inter3,loadLocation)       \
-{                                                                                                              \
-inter0 = _mm_mul_epi32(x1, y1);                                                                                \
-inter1 = _mm_mul_epi32(tmp0, tmp2);                                                                            \
-inter2 = _mm_mul_epi32(x2, y2);                                                                                \
-inter3 = _mm_mul_epi32(tmp1, tmp3);                                                                            \
-inter2 = _mm_add_epi64(inter0, inter2);                                                                        \
-inter3 = _mm_add_epi64(inter1, inter3);                                                                        \
-inter0 = _mm_loadl_epi64(loadLocation);                                                                        \
-inter3 = _mm_add_epi64(inter2, inter3);                                                                        \
-inter1 = _mm_srli_si128(inter3, 8);                                                                            \
-inter3 = _mm_add_epi64(inter1, inter3);                                                                        \
-inter3 = _mm_add_epi64(inter0, inter3);                                                                        \
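+// Replicate row 1 into row 0 and row (height - 2) into row (height - 1): padding of the Sobel
+// output at the top and bottom borders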
+static inline void sobelPadTopBottom(int32_t* dst, const ptrdiff_t dstStride, const int width, const int height)
+{
+  int* firstRow = dst;
+  std::copy_n(firstRow + dstStride, width, firstRow);
+
+  int* lastRow = dst + (height - 1) * dstStride;
+  std::copy_n(lastRow - dstStride, width, lastRow);
 }
 
-template<X86_VEXT vext>
-static void simdHorizontalSobelFilter(Pel *const pPred, const ptrdiff_t predStride, int *const pDerivate,
-                                      const ptrdiff_t derivateBufStride, const int width, const int height)
+template<X86_VEXT vext, bool EDGES>
+static inline void simdHorizontalSobelFilter4(const Pel* src, const ptrdiff_t srcStride, int32_t* dst,
+                                              const ptrdiff_t dstStride, const int widthOrCol, const int height)
 {
-  __m128i mmPred[4];
-  __m128i mm2xPred[2];
-  __m128i mmIntermediates[4];
-  __m128i mmDerivate[2];
+  // 'widthOrCol' contains 'width' if 'EDGES' is true, and 'col' otherwise
 
-  assert( !(height % 2) );
-  assert( !(width % 4) );
-
-  /* Derivates of the rows and columns at the boundary are done at the end of this function */
-  /* The value of col and row indicate the columns and rows for which the derivates have already been computed */
-  for ( int col = 1; (col + 2) < width; col += 2 )
-  {
-    mmPred[0] = _mm_loadl_epi64( reinterpret_cast<const __m128i *>(&pPred[0 * predStride + col - 1]) );
-    mmPred[1] = _mm_loadl_epi64( reinterpret_cast<const __m128i *>(&pPred[1 * predStride + col - 1]) );
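+  // The shuffle pairs up the samples needed for the horizontal difference so that a single madd
+  // with (-1, 1, ...) yields src[c + 1] - src[c - 1] for four consecutive columns; the EDGES
+  // variant computes the differences at columns 1 and width - 2 and duplicates them into the
+  // padded border columns 0 and width - 1.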
+  const __m128i s = EDGES ? _mm_setr_epi8(0, 1, 4, 5, 0, 1, 4, 5, 10, 11, 14, 15, 10, 11, 14, 15)
+                          : _mm_setr_epi8(2, 3, 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9, 12, 13);
+  const __m128i m = _mm_setr_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
 
-    mmPred[0] = _mm_cvtepi16_epi32( mmPred[0] );
-    mmPred[1] = _mm_cvtepi16_epi32( mmPred[1] );
+  __m128i x[4];   // source rows
 
-    for ( int row = 1; row < (height - 1); row += 2 )
+  for (int k = 0; k < 2; k++)
+  {
+    if constexpr (EDGES)
     {
-      mmPred[2] = _mm_loadl_epi64( reinterpret_cast<const __m128i *>(&pPred[(row + 1) * predStride + col - 1]) );
-      mmPred[3] = _mm_loadl_epi64( reinterpret_cast<const __m128i *>(&pPred[(row + 2) * predStride + col - 1]) );
-
-      mmPred[2] = _mm_cvtepi16_epi32( mmPred[2] );
-      mmPred[3] = _mm_cvtepi16_epi32( mmPred[3] );
-
-      mm2xPred[0] = _mm_slli_epi32( mmPred[1], 1 );
-      mm2xPred[1] = _mm_slli_epi32( mmPred[2], 1 );
+      __m128i l = _mm_loadl_epi64((const __m128i*) (src + k * srcStride));
+      __m128i r = _mm_loadl_epi64((const __m128i*) (src + k * srcStride + widthOrCol - 4));
+      x[k]      = _mm_unpacklo_epi64(l, r);
+    }
+    else
+    {
+      x[k] = _mm_loadu_si128((const __m128i*) (src + k * srcStride + widthOrCol - 2));
+    }
 
-      mmIntermediates[0] = _mm_add_epi32( mm2xPred[0], mmPred[0] );
-      mmIntermediates[2] = _mm_add_epi32( mm2xPred[1], mmPred[1] );
+    x[k] = _mm_shuffle_epi8(x[k], s);
+    x[k] = _mm_madd_epi16(x[k], m);
+  }
 
-      mmIntermediates[0] = _mm_add_epi32( mmIntermediates[0], mmPred[2] );
-      mmIntermediates[2] = _mm_add_epi32( mmIntermediates[2], mmPred[3] );
+  for (int row = 1; row < height - 1; row += 2)
+  {
+    for (int k = 2; k < 4; k++)
+    {
+      if constexpr (EDGES)
+      {
+        __m128i l = _mm_loadl_epi64((const __m128i*) (src + (row + k - 1) * srcStride));
+        __m128i r = _mm_loadl_epi64((const __m128i*) (src + (row + k - 1) * srcStride + widthOrCol - 4));
+        x[k]      = _mm_unpacklo_epi64(l, r);
+      }
+      else
+      {
+        x[k] = _mm_loadu_si128((const __m128i*) (src + (row + k - 1) * srcStride + widthOrCol - 2));
+      }
 
-      mmPred[0] = mmPred[2];
-      mmPred[1] = mmPred[3];
+      x[k] = _mm_shuffle_epi8(x[k], s);
+      x[k] = _mm_madd_epi16(x[k], m);
+    }
 
-      mmIntermediates[1] = _mm_srli_si128( mmIntermediates[0], 8 );
-      mmIntermediates[3] = _mm_srli_si128( mmIntermediates[2], 8 );
+    __m128i sum[3];
+    for (int i = 0; i < 3; i++)
+    {
+      sum[i] = _mm_add_epi32(x[i], x[i + 1]);
+    }
 
-      mmDerivate[0] = _mm_sub_epi32( mmIntermediates[1], mmIntermediates[0] );
-      mmDerivate[1] = _mm_sub_epi32( mmIntermediates[3], mmIntermediates[2] );
+    for (int k = 0; k < 2; k++)
+    {
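+      // sum[k] + sum[k + 1] == x[k] + 2 * x[k + 1] + x[k + 2]: vertical [1 2 1] smoothing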
+      __m128i r = _mm_add_epi32(sum[k], sum[k + 1]);
+      if constexpr (EDGES)
+      {
+        _mm_storel_epi64((__m128i*) (dst + (row + k) * dstStride), r);
+        _mm_storel_epi64((__m128i*) (dst + (row + k) * dstStride + widthOrCol - 2), _mm_unpackhi_epi64(r, r));
+      }
+      else
+      {
+        _mm_storeu_si128((__m128i*) (dst + (row + k) * dstStride + widthOrCol), r);
+      }
 
-      _mm_storel_epi64( reinterpret_cast<__m128i *> (&pDerivate[col + (row + 0) * derivateBufStride]), mmDerivate[0] );
-      _mm_storel_epi64( reinterpret_cast<__m128i *> (&pDerivate[col + (row + 1) * derivateBufStride]), mmDerivate[1] );
+      x[k] = x[k + 2];
     }
   }
+}
+
+template<X86_VEXT vext> static void simdHorizontalSobelFilter(Pel* const src, const ptrdiff_t srcStride, int* const dst,
+                                                              const ptrdiff_t dstStride, const int width,
+                                                              const int height)
+{
+  CHECK(height % 2 != 0, "height must be even");
+  CHECK(width % 4 != 0, "width must be a multiple of 4");
+
+  simdHorizontalSobelFilter4<vext, true>(src, srcStride, dst, dstStride, width, height);
 
-  for ( int j = 1; j < (height - 1); j++ )
+  for (int col = 2; col < width - 2; col += 4)
   {
-    pDerivate[j * derivateBufStride] = pDerivate[j * derivateBufStride + 1];
-    pDerivate[j * derivateBufStride + (width - 1)] = pDerivate[j * derivateBufStride + (width - 2)];
+    simdHorizontalSobelFilter4<vext, false>(src, srcStride, dst, dstStride, col, height);
   }
 
-  memcpy( pDerivate, pDerivate + derivateBufStride, width * sizeof( pDerivate[0] ) );
-  memcpy( pDerivate + (height - 1) * derivateBufStride, pDerivate + (height - 2) * derivateBufStride, width * sizeof( pDerivate[0] )
-  );
+  sobelPadTopBottom(dst, dstStride, width, height);
 }
 
-template<X86_VEXT vext>
-static void simdVerticalSobelFilter(Pel *const pPred, const ptrdiff_t predStride, int *const pDerivate,
-                                    const ptrdiff_t derivateBufStride, const int width, const int height)
+template<X86_VEXT vext, bool EDGES>
+static inline void simdVerticalSobelFilter4(const Pel* src, const ptrdiff_t srcStride, int32_t* dst,
+                                            const ptrdiff_t dstStride, const int widthOrCol, const int height)
 {
-  __m128i mmPred[4];
-  __m128i mmIntermediates[6];
-  __m128i mmDerivate[2];
+  // 'widthOrCol' contains 'width' if 'EDGES' is true, and 'col' otherwise
 
-  assert( !(height % 2) );
-  assert( !(width % 4) );
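+  // s0/s1 select overlapping sample pairs so that two madds with all-ones produce the horizontal
+  // [1 2 1] smoothing src[c - 1] + 2 * src[c] + src[c + 1] for four consecutive columns; the EDGES
+  // variant smooths columns 1 and width - 2 and duplicates them into the padded border columns.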
+  const __m128i s0 = EDGES ? _mm_setr_epi8(0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 12, 13, 14, 15)
+                           : _mm_setr_epi8(2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11);
+  const __m128i s1 = EDGES ? _mm_setr_epi8(2, 3, 4, 5, 2, 3, 4, 5, 10, 11, 12, 13, 10, 11, 12, 13)
+                           : _mm_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13);
+  const __m128i m  = _mm_set1_epi16(1);
 
-  /* Derivates of the rows and columns at the boundary are done at the end of this function */
-  /* The value of col and row indicate the columns and rows for which the derivates have already been computed */
-  for ( int col = 1; col < (width - 1); col += 2 )
-  {
-    mmPred[0] = _mm_loadl_epi64( reinterpret_cast<const __m128i *>(&pPred[0 * predStride + col - 1]) );
-    mmPred[1] = _mm_loadl_epi64( reinterpret_cast<const __m128i *>(&pPred[1 * predStride + col - 1]) );
-
-    mmPred[0] = _mm_cvtepi16_epi32( mmPred[0] );
-    mmPred[1] = _mm_cvtepi16_epi32( mmPred[1] );
+  __m128i x[4];   // source rows
 
-    for ( int row = 1; row < (height - 1); row += 2 )
+  for (int k = 0; k < 2; k++)
+  {
+    if constexpr (EDGES)
     {
-      mmPred[2] = _mm_loadl_epi64( reinterpret_cast<const __m128i *>(&pPred[(row + 1) * predStride + col - 1]) );
-      mmPred[3] = _mm_loadl_epi64( reinterpret_cast<const __m128i *>(&pPred[(row + 2) * predStride + col - 1]) );
-
-      mmPred[2] = _mm_cvtepi16_epi32( mmPred[2] );
-      mmPred[3] = _mm_cvtepi16_epi32( mmPred[3] );
+      __m128i l = _mm_loadl_epi64((const __m128i*) (src + k * srcStride));
+      __m128i r = _mm_loadl_epi64((const __m128i*) (src + k * srcStride + widthOrCol - 4));
+      x[k]      = _mm_unpacklo_epi64(l, r);
+    }
+    else
+    {
+      x[k] = _mm_loadu_si128((const __m128i*) (src + k * srcStride + widthOrCol - 2));
+    }
 
-      mmIntermediates[0] = _mm_sub_epi32( mmPred[2], mmPred[0] );
-      mmIntermediates[3] = _mm_sub_epi32( mmPred[3], mmPred[1] );
+    __m128i tmp = x[k];
 
-      mmPred[0] = mmPred[2];
-      mmPred[1] = mmPred[3];
+    tmp  = _mm_shuffle_epi8(tmp, s0);
+    tmp  = _mm_madd_epi16(tmp, m);
+    x[k] = _mm_shuffle_epi8(x[k], s1);
+    x[k] = _mm_madd_epi16(x[k], m);
+    x[k] = _mm_add_epi32(x[k], tmp);
+  }
 
-      mmIntermediates[1] = _mm_srli_si128( mmIntermediates[0], 4 );
-      mmIntermediates[4] = _mm_srli_si128( mmIntermediates[3], 4 );
-      mmIntermediates[2] = _mm_srli_si128( mmIntermediates[0], 8 );
-      mmIntermediates[5] = _mm_srli_si128( mmIntermediates[3], 8 );
+  for (int row = 1; row < height - 1; row += 2)
+  {
+    for (int k = 2; k < 4; k++)
+    {
+      if constexpr (EDGES)
+      {
+        __m128i l = _mm_loadl_epi64((const __m128i*) (src + (row + k - 1) * srcStride));
+        __m128i r = _mm_loadl_epi64((const __m128i*) (src + (row + k - 1) * srcStride + widthOrCol - 4));
+        x[k]      = _mm_unpacklo_epi64(l, r);
+      }
+      else
+      {
+        x[k] = _mm_loadu_si128((const __m128i*) (src + (row + k - 1) * srcStride + widthOrCol - 2));
+      }
 
-      mmIntermediates[1] = _mm_slli_epi32( mmIntermediates[1], 1 );
-      mmIntermediates[4] = _mm_slli_epi32( mmIntermediates[4], 1 );
+      __m128i tmp = x[k];
 
-      mmIntermediates[0] = _mm_add_epi32( mmIntermediates[0], mmIntermediates[2] );
-      mmIntermediates[3] = _mm_add_epi32( mmIntermediates[3], mmIntermediates[5] );
+      tmp  = _mm_shuffle_epi8(tmp, s0);
+      tmp  = _mm_madd_epi16(tmp, m);
+      x[k] = _mm_shuffle_epi8(x[k], s1);
+      x[k] = _mm_madd_epi16(x[k], m);
+      x[k] = _mm_add_epi32(x[k], tmp);
+    }
 
-      mmDerivate[0] = _mm_add_epi32( mmIntermediates[0], mmIntermediates[1] );
-      mmDerivate[1] = _mm_add_epi32( mmIntermediates[3], mmIntermediates[4] );
+    for (int k = 0; k < 2; k++)
+    {
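+      // difference of the smoothed rows (row + k + 1) and (row + k - 1): vertical Sobel response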
+      __m128i r = _mm_sub_epi32(x[k + 2], x[k]);
+      if constexpr (EDGES)
+      {
+        _mm_storel_epi64((__m128i*) (dst + (row + k) * dstStride), r);
+        _mm_storel_epi64((__m128i*) (dst + (row + k) * dstStride + widthOrCol - 2), _mm_unpackhi_epi64(r, r));
+      }
+      else
+      {
+        _mm_storeu_si128((__m128i*) (dst + (row + k) * dstStride + widthOrCol), r);
+      }
 
-      _mm_storel_epi64( reinterpret_cast<__m128i *> (&pDerivate[col + (row + 0) * derivateBufStride]), mmDerivate[0] );
-      _mm_storel_epi64( reinterpret_cast<__m128i *> (&pDerivate[col + (row + 1) * derivateBufStride]), mmDerivate[1] );
+      x[k] = x[k + 2];
     }
   }
+}
+
+template<X86_VEXT vext> static void simdVerticalSobelFilter(Pel* const src, const ptrdiff_t srcStride, int* const dst,
+                                                            const ptrdiff_t dstStride, const int width,
+                                                            const int height)
+{
+  CHECK(height % 2 != 0, "height must be even");
+  CHECK(width % 4 != 0, "width must be a multiple of 4");
 
-  for ( int j = 1; j < (height - 1); j++ )
+  simdVerticalSobelFilter4<vext, true>(src, srcStride, dst, dstStride, width, height);
+
+  for (int col = 2; col < width - 2; col += 4)
   {
-    pDerivate[j * derivateBufStride] = pDerivate[j * derivateBufStride + 1];
-    pDerivate[j * derivateBufStride + (width - 1)] = pDerivate[j * derivateBufStride + (width - 2)];
+    simdVerticalSobelFilter4<vext, false>(src, srcStride, dst, dstStride, col, height);
   }
 
-  memcpy( pDerivate, pDerivate + derivateBufStride, width * sizeof( pDerivate[0] ) );
-  memcpy( pDerivate + (height - 1) * derivateBufStride, pDerivate + (height - 2) * derivateBufStride, width * sizeof( pDerivate[0] ) );
+  sobelPadTopBottom(dst, dstStride, width, height);
 }
 
-template<X86_VEXT vext>
-static void simdEqualCoeffComputer(Pel *pResidue, ptrdiff_t residueStride, int **ppDerivate,
-                                   ptrdiff_t derivateBufStride, int64_t (*pEqualCoeff)[7], int width, int height,
-                                   bool b6Param)
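+// Accumulate the sum of products of the eight int32 values in x and y into the 64-bit value at *dst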
+static inline void calcEqualCoeff8Pxls(const __m128i x[2], const __m128i y[2], __m128i* dst)
 {
-  __m128i mmFour;
-  __m128i mmTmp[4];
-  __m128i mmIntermediate[4];
-  __m128i mmIndxK, mmIndxJ;
-  __m128i mmResidue[2];
-  __m128i mmC[12];
+  __m128i sum = _mm_setzero_si128();
 
-  // Add directly to indexes to get new index
-  mmFour = _mm_set1_epi32(4);
-  mmIndxJ = _mm_set1_epi32(-2);
+  sum = _mm_add_epi64(sum, _mm_mul_epi32(x[0], y[0]));
+  sum = _mm_add_epi64(sum, _mm_mul_epi32(_mm_srli_si128(x[0], 4), _mm_srli_si128(y[0], 4)));
+  sum = _mm_add_epi64(sum, _mm_mul_epi32(x[1], y[1]));
+  sum = _mm_add_epi64(sum, _mm_mul_epi32(_mm_srli_si128(x[1], 4), _mm_srli_si128(y[1], 4)));
 
+  __m128i accum = _mm_loadl_epi64(dst);
+  accum         = _mm_add_epi64(accum, sum);
+  accum         = _mm_add_epi64(accum, _mm_shuffle_epi32(accum, _MM_SHUFFLE(1, 0, 3, 2)));
+  _mm_storel_epi64(dst, accum);
+}
 
-  int n = b6Param ? 6 : 4;
-  ptrdiff_t idx1 = 0, idx2 = 0;
-  idx1 = -2 * derivateBufStride - 4;
-  idx2 = -derivateBufStride - 4;
+#if USE_AVX2
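+// AVX2 variant of calcEqualCoeff8Pxls: sixteen int32 products accumulated into *dst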
+static inline void calcEqualCoeff16Pxls(const __m256i x[2], const __m256i y[2], __m128i* dst)
+{
+  __m256i sum = _mm256_setzero_si256();
+
+  sum = _mm256_add_epi64(sum, _mm256_mul_epi32(x[0], y[0]));
+  sum = _mm256_add_epi64(sum, _mm256_mul_epi32(_mm256_srli_si256(x[0], 4), _mm256_srli_si256(y[0], 4)));
+  sum = _mm256_add_epi64(sum, _mm256_mul_epi32(x[1], y[1]));
+  sum = _mm256_add_epi64(sum, _mm256_mul_epi32(_mm256_srli_si256(x[1], 4), _mm256_srli_si256(y[1], 4)));
+
+  __m128i accum = _mm_loadl_epi64(dst);
+  accum         = _mm_add_epi64(accum, _mm256_castsi256_si128(sum));
+  accum         = _mm_add_epi64(accum, _mm256_extracti128_si256(sum, 1));
+  accum         = _mm_add_epi64(accum, _mm_shuffle_epi32(accum, _MM_SHUFFLE(1, 0, 3, 2)));
+  _mm_storel_epi64(dst, accum);
+}
+#endif
 
-  for ( int j = 0; j < height; j += 2 )
-  {
-    if (!(j & 3))
-      mmIndxJ = _mm_add_epi32(mmIndxJ, mmFour);
-    mmIndxK = _mm_set1_epi32(-2);
-    idx1 += (derivateBufStride << 1);
-    idx2 += (derivateBufStride << 1);
+template<X86_VEXT vext> static void simdEqualCoeffComputer(Pel* target, ptrdiff_t targetStride, int** grad,
+                                                           ptrdiff_t gradStride, int64_t (*cov)[7], int width,
+                                                           int height, bool b6Param)
+{
+  const int n = b6Param ? 6 : 4;
+  CHECK(targetStride != gradStride, "stride mismatch");
+
+  const int* gradHor = grad[0];
+  const int* gradVer = grad[1];
 
-    for ( int k = 0; k < width; k += 4 )
+#if USE_AVX2
+  if constexpr (vext >= AVX2)
+  {
+    for (int j = 0; j < height; j += 2)
     {
-      idx1 += 4;
-      idx2 += 4;
-      mmIndxK = _mm_add_epi32( mmIndxK, mmFour );
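+      // row position term: for even j, j | 2 equals (j & ~3) + 2, i.e. constant over each group of four rows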
+      const __m256i mmIndxJ = _mm256_set1_epi32(j | 2);
 
-      if ( b6Param )
-      {
-        // mmC[0-5] for iC[0-5] of 1st row of pixels
-        mmC[0] = _mm_loadu_si128( (const __m128i*)&ppDerivate[0][idx1] );
-        mmC[2] = _mm_loadu_si128( (const __m128i*)&ppDerivate[1][idx1] );
-        mmC[1] = _mm_mullo_epi32( mmIndxK, mmC[0] );
-        mmC[3] = _mm_mullo_epi32( mmIndxK, mmC[2] );
-        mmC[4] = _mm_mullo_epi32(mmIndxJ, mmC[0]);
-        mmC[5] = _mm_mullo_epi32(mmIndxJ, mmC[2]);
-
-        // mmC[6-11] for iC[0-5] of 2nd row of pixels
-        mmC[6] = _mm_loadu_si128( (const __m128i*)&ppDerivate[0][idx2] );
-        mmC[8] = _mm_loadu_si128( (const __m128i*)&ppDerivate[1][idx2] );
-        mmC[7] = _mm_mullo_epi32( mmIndxK, mmC[6] );
-        mmC[9] = _mm_mullo_epi32( mmIndxK, mmC[8] );
-        mmC[10] = _mm_mullo_epi32(mmIndxJ, mmC[6]);
-        mmC[11] = _mm_mullo_epi32(mmIndxJ, mmC[8]);
-      }
-      else
+      for (int k = 0; k < width; k += 8)
       {
-        // mmC[0-3] for iC[0-3] of 1st row of pixels
-        mmC[0] = _mm_loadu_si128( (const __m128i*)&ppDerivate[0][idx1] );
-        mmC[2] = _mm_loadu_si128( (const __m128i*)&ppDerivate[1][idx1] );
-        mmC[1] = _mm_mullo_epi32( mmIndxK, mmC[0] );
-        mmC[3] = _mm_mullo_epi32(mmIndxJ, mmC[0]);
-        mmTmp[0] = _mm_mullo_epi32(mmIndxJ, mmC[2]);
-        mmTmp[1] = _mm_mullo_epi32( mmIndxK, mmC[2] );
-        mmC[1] = _mm_add_epi32( mmC[1], mmTmp[0] );
-        mmC[3] = _mm_sub_epi32( mmC[3], mmTmp[1] );
-
-        // mmC[4-7] for iC[0-3] of 1st row of pixels
-        mmC[4] = _mm_loadu_si128( (const __m128i*)&ppDerivate[0][idx2] );
-        mmC[6] = _mm_loadu_si128( (const __m128i*)&ppDerivate[1][idx2] );
-        mmC[5] = _mm_mullo_epi32( mmIndxK, mmC[4] );
-        mmC[7] = _mm_mullo_epi32(mmIndxJ, mmC[4]);
-        mmTmp[2] = _mm_mullo_epi32(mmIndxJ, mmC[6]);
-        mmTmp[3] = _mm_mullo_epi32( mmIndxK, mmC[6] );
-        mmC[5] = _mm_add_epi32( mmC[5], mmTmp[2] );
-        mmC[7] = _mm_sub_epi32( mmC[7], mmTmp[3] );
-      }
+        const ptrdiff_t idx1    = j * gradStride + k;
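+        // column position term: k + 2 for the low four lanes, k + 6 for the high four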
+        const __m256i   mmIndxK = _mm256_inserti128_si256(_mm256_set1_epi32(k + 2), _mm_set1_epi32(k + 6), 1);
+
+        __m256i mmC[6 + 1][2];
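+        // mmC layout: first index is the model term (0..n-1 gradient-based terms, n holds 8 * residue),
+        // second index selects the pixel row within the pair (j, j + 1)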
+
+        for (int r = 0; r < 2; r++)
+        {
+          mmC[0][r] = _mm256_loadu_si256((const __m256i*) &gradHor[idx1 + r * gradStride]);
+          mmC[2][r] = _mm256_loadu_si256((const __m256i*) &gradVer[idx1 + r * gradStride]);
+          mmC[1][r] = _mm256_mullo_epi32(mmIndxK, mmC[0][r]);
+          mmC[3][r] = _mm256_mullo_epi32(mmIndxK, mmC[2][r]);
+          mmC[4][r] = _mm256_mullo_epi32(mmIndxJ, mmC[0][r]);
+          mmC[5][r] = _mm256_mullo_epi32(mmIndxJ, mmC[2][r]);
+
+          if (!b6Param)
+          {
+            mmC[1][r] = _mm256_add_epi32(mmC[1][r], mmC[5][r]);
+            mmC[3][r] = _mm256_sub_epi32(mmC[4][r], mmC[3][r]);
+          }
 
-      // Residue
-      mmResidue[0] = _mm_loadl_epi64( (const __m128i*)&pResidue[idx1] );
-      mmResidue[1] = _mm_loadl_epi64( (const __m128i*)&pResidue[idx2] );
-      mmResidue[0] = _mm_cvtepi16_epi32( mmResidue[0] );
-      mmResidue[1] = _mm_cvtepi16_epi32( mmResidue[1] );
-      mmResidue[0] = _mm_slli_epi32( mmResidue[0], 3 );
-      mmResidue[1] = _mm_slli_epi32( mmResidue[1], 3 );
+          // Residue
+          if constexpr (sizeof(Pel) == 2)
+          {
+            mmC[n][r] = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*) &target[idx1 + r * gradStride]));
+          }
+          else if constexpr (sizeof(Pel) == 4)
+          {
+            mmC[n][r] = _mm256_loadu_si256((const __m256i*) &target[idx1 + r * gradStride]);
+          }
+          mmC[n][r] = _mm256_slli_epi32(mmC[n][r], 3);
+        }
 
-      // Calculation of coefficient matrix
-      for ( int col = 0; col < n; col++ )
+        // Calculation of coefficient matrix
+        for (int col = 0; col < n; col++)
+        {
+          for (int row = col; row <= n; row++)
+          {
+            calcEqualCoeff16Pxls(mmC[col], mmC[row], (__m128i*) &cov[col + 1][row]);
+          }
+        }
+      }
+    }
+  }
+  else
+#endif
+  {
+    for (int j = 0; j < height; j += 2)
+    {
+      const __m128i mmIndxJ = _mm_set1_epi32(j | 2);
+
+      for (int k = 0; k < width; k += 4)
       {
-        mmTmp[0] = _mm_srli_si128( mmC[0 + col], 4 );
-        mmTmp[1] = _mm_srli_si128( mmC[n + col], 4 );
-        CALC_EQUAL_COEFF_8PXLS( mmC[0 + col], mmC[n + col], mmC[0 + col], mmC[n + col], mmTmp[0], mmTmp[1], mmTmp[0], mmTmp[1], mmIntermediate[0], mmIntermediate[1], mmIntermediate[2], mmIntermediate[3], (const __m128i*)&pEqualCoeff[col + 1][col] );
-        _mm_storel_epi64( (__m128i*)&pEqualCoeff[col + 1][col], mmIntermediate[3] );
+        const ptrdiff_t idx1    = j * gradStride + k;
+        const __m128i   mmIndxK = _mm_set1_epi32(k + 2);
+
+        __m128i mmC[6 + 1][2];
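+        // same mmC layout as above: model terms 0..n-1, 8 * residue at index n, second index = pixel row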
 
-        for ( int row = col + 1; row < n; row++ )
+        for (int r = 0; r < 2; r++)
         {
-          mmTmp[2] = _mm_srli_si128( mmC[0 + row], 4 );
-          mmTmp[3] = _mm_srli_si128( mmC[n + row], 4 );
-          CALC_EQUAL_COEFF_8PXLS( mmC[0 + col], mmC[n + col], mmC[0 + row], mmC[n + row], mmTmp[0], mmTmp[1], mmTmp[2], mmTmp[3], mmIntermediate[0], mmIntermediate[1], mmIntermediate[2], mmIntermediate[3], (const __m128i*)&pEqualCoeff[col + 1][row] );
-          _mm_storel_epi64( (__m128i*)&pEqualCoeff[col + 1][row], mmIntermediate[3] );
-          _mm_storel_epi64( (__m128i*)&pEqualCoeff[row + 1][col], mmIntermediate[3] );
+          mmC[0][r] = _mm_loadu_si128((const __m128i*) &gradHor[idx1 + r * gradStride]);
+          mmC[2][r] = _mm_loadu_si128((const __m128i*) &gradVer[idx1 + r * gradStride]);
+          mmC[1][r] = _mm_mullo_epi32(mmIndxK, mmC[0][r]);
+          mmC[3][r] = _mm_mullo_epi32(mmIndxK, mmC[2][r]);
+          mmC[4][r] = _mm_mullo_epi32(mmIndxJ, mmC[0][r]);
+          mmC[5][r] = _mm_mullo_epi32(mmIndxJ, mmC[2][r]);
+
+          if (!b6Param)
+          {
+            mmC[1][r] = _mm_add_epi32(mmC[1][r], mmC[5][r]);
+            mmC[3][r] = _mm_sub_epi32(mmC[4][r], mmC[3][r]);
+          }
+
+          // Residue
+          if constexpr (sizeof(Pel) == 2)
+          {
+            mmC[n][r] = _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i*) &target[idx1 + r * gradStride]));
+          }
+          else if constexpr (sizeof(Pel) == 4)
+          {
+            mmC[n][r] = _mm_loadu_si128((const __m128i*) &target[idx1 + r * gradStride]);
+          }
+          mmC[n][r] = _mm_slli_epi32(mmC[n][r], 3);
         }
 
-        mmTmp[2] = _mm_srli_si128( mmResidue[0], 4 );
-        mmTmp[3] = _mm_srli_si128( mmResidue[1], 4 );
-        CALC_EQUAL_COEFF_8PXLS( mmC[0 + col], mmC[n + col], mmResidue[0], mmResidue[1], mmTmp[0], mmTmp[1], mmTmp[2], mmTmp[3], mmIntermediate[0], mmIntermediate[1], mmIntermediate[2], mmIntermediate[3], (const __m128i*)&pEqualCoeff[col + 1][n] );
-        _mm_storel_epi64( (__m128i*)&pEqualCoeff[col + 1][n], mmIntermediate[3] );
+        // Calculation of coefficient matrix
+        for (int col = 0; col < n; col++)
+        {
+          for (int row = col; row <= n; row++)
+          {
+            calcEqualCoeff8Pxls(mmC[col], mmC[row], (__m128i*) &cov[col + 1][row]);
+          }
+        }
       }
     }
+  }
 
-    idx1 -= (width);
-    idx2 -= (width);
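+  // The system matrix is symmetric: the loops above fill the upper triangle (plus the
+  // right-hand-side column at index n); mirror the upper triangle into the lower triangle.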
+  for (int col = 0; col < n; col++)
+  {
+    for (int row = col + 1; row < n; row++)
+    {
+      cov[row + 1][col] = cov[col + 1][row];
+    }
   }
 }
+
 #if RExt__HIGH_BIT_DEPTH_SUPPORT
-template<X86_VEXT vext>
-static void simdHorizontalSobelFilter_HBD_SIMD(Pel *const pPred, const ptrdiff_t predStride, int *const pDerivate,
-                                               const ptrdiff_t derivateBufStride, const int width, const int height)
+#if USE_AVX2
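+// Store the lowest six 32-bit elements of val (the HBD AVX2 paths produce six results per row)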
+static inline void store6x32(int *ptr, __m256i val)
 {
-  __m128i pred[4];
-  __m128i pred2x[2];
-  __m128i intermediates[4];
-  __m128i derivate[2];
+  _mm_storeu_si128((__m128i *)ptr, _mm256_castsi256_si128(val));
+  _mm_storel_epi64((__m128i *)(ptr + 4), _mm256_extracti128_si256(val, 1));
+}
+#endif
 
+template<X86_VEXT vext> static void simdHorizontalSobelFilter_HBD_SIMD(Pel* const src, const ptrdiff_t srcStride,
+                                                                       int* const dst, const ptrdiff_t dstStride,
+                                                                       const int width, const int height)
+{
   assert(!(height % 2));
   assert(!(width % 4));
 
@@ -330,44 +419,35 @@ static void simdHorizontalSobelFilter_HBD_SIMD(Pel *const pPred, const ptrdiff_t
 #if USE_AVX2
   if (vext >= AVX2)
   {
-    __m256i pred256[4];
-    __m256i pred2x256[2];
-    __m256i intermediates256[4];
-    __m256i derivate256[2];
-
     for (; (col + 6) < width; col += 6)
     {
-      pred256[0] = _mm256_lddqu_si256((__m256i *)&pPred[col - 1]);
-      pred256[1] = _mm256_lddqu_si256((__m256i *)&pPred[predStride + col - 1]);
+      __m256i x[4];
+
+      x[0] = _mm256_lddqu_si256((__m256i*) &src[col - 1]);
+      x[1] = _mm256_lddqu_si256((__m256i*) &src[srcStride + col - 1]);
 
       for (int row = 1; row < (height - 1); row += 2)
       {
-        pred256[2] = _mm256_lddqu_si256((__m256i *)&pPred[(row + 1) * predStride + col - 1]);
-        pred256[3] = _mm256_lddqu_si256((__m256i *)&pPred[(row + 2) * predStride + col - 1]);
+        x[2] = _mm256_lddqu_si256((__m256i*) &src[(row + 1) * srcStride + col - 1]);
+        x[3] = _mm256_lddqu_si256((__m256i*) &src[(row + 2) * srcStride + col - 1]);
 
-        pred2x256[0] = _mm256_slli_epi32(pred256[1], 1);
-        pred2x256[1] = _mm256_slli_epi32(pred256[2], 1);
-
-        intermediates256[0] = _mm256_add_epi32(pred2x256[0], pred256[0]);
-        intermediates256[2] = _mm256_add_epi32(pred2x256[1], pred256[1]);
-
-        intermediates256[0] = _mm256_add_epi32(intermediates256[0], pred256[2]);
-        intermediates256[2] = _mm256_add_epi32(intermediates256[2], pred256[3]);
-
-        pred256[0] = pred256[2];
-        pred256[1] = pred256[3];
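+        // adjacent-row sums; adding adjacent pairs of sums again below yields the [1 2 1] vertical smoothing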
+        __m256i sum[3];
+        for (int i = 0; i < 3; i++)
+        {
+          sum[i] = _mm256_add_epi32(x[i], x[i + 1]);
+        }
 
-        intermediates256[1] = _mm256_permute4x64_epi64(intermediates256[0], 0xf9);
-        intermediates256[3] = _mm256_permute4x64_epi64(intermediates256[2], 0xf9);
+        __m256i r0 = _mm256_add_epi32(sum[0], sum[1]);
+        __m256i r1 = _mm256_add_epi32(sum[1], sum[2]);
 
-        derivate256[0] = _mm256_sub_epi32(intermediates256[1], intermediates256[0]);
-        derivate256[1] = _mm256_sub_epi32(intermediates256[3], intermediates256[2]);
+        // permute4x64 with 0x39 rotates the 64-bit lanes down by one (two int32 columns), so the
+        // subtraction yields smoothed[c + 1] - smoothed[c - 1]
+        r0 = _mm256_sub_epi32(_mm256_permute4x64_epi64(r0, 0x39), r0);
+        r1 = _mm256_sub_epi32(_mm256_permute4x64_epi64(r1, 0x39), r1);
 
-        _mm_storeu_si128((__m128i *)&pDerivate[col + row * derivateBufStride], _mm256_castsi256_si128(derivate256[0]));
-        _mm_storel_epi64((__m128i *)&pDerivate[col + 4 + row * derivateBufStride], _mm256_castsi256_si128(_mm256_permute4x64_epi64(derivate256[0], 0xaa)));
+        store6x32(&dst[col + row * dstStride], r0);
+        store6x32(&dst[col + (row + 1) * dstStride], r1);
 
-        _mm_storeu_si128((__m128i *)&pDerivate[col + (row + 1) * derivateBufStride], _mm256_castsi256_si128(derivate256[1]));
-        _mm_storel_epi64((__m128i *)&pDerivate[col + 4 + (row + 1) * derivateBufStride], _mm256_castsi256_si128(_mm256_permute4x64_epi64(derivate256[1], 0xaa)));
+        x[0] = x[2];
+        x[1] = x[3];
       }
     }
   }
@@ -375,56 +455,49 @@ static void simdHorizontalSobelFilter_HBD_SIMD(Pel *const pPred, const ptrdiff_t
 
   for (; (col + 2) < width; col += 2)
   {
-    pred[0] = _mm_lddqu_si128((__m128i *)&pPred[col - 1]);
-    pred[1] = _mm_lddqu_si128((__m128i *)&pPred[predStride + col - 1]);
+    __m128i x[4];
+
+    x[0] = _mm_lddqu_si128((__m128i*) &src[col - 1]);
+    x[1] = _mm_lddqu_si128((__m128i*) &src[srcStride + col - 1]);
 
     for (int row = 1; row < (height - 1); row += 2)
     {
-      pred[2] = _mm_lddqu_si128((__m128i *)&pPred[(row + 1) * predStride + col - 1]);
-      pred[3] = _mm_lddqu_si128((__m128i *)&pPred[(row + 2) * predStride + col - 1]);
+      x[2] = _mm_lddqu_si128((__m128i*) &src[(row + 1) * srcStride + col - 1]);
+      x[3] = _mm_lddqu_si128((__m128i*) &src[(row + 2) * srcStride + col - 1]);
 
-      pred2x[0] = _mm_slli_epi32(pred[1], 1);
-      pred2x[1] = _mm_slli_epi32(pred[2], 1);
-
-      intermediates[0] = _mm_add_epi32(pred2x[0], pred[0]);
-      intermediates[2] = _mm_add_epi32(pred2x[1], pred[1]);
-
-      intermediates[0] = _mm_add_epi32(intermediates[0], pred[2]);
-      intermediates[2] = _mm_add_epi32(intermediates[2], pred[3]);
+      __m128i sum[3];
+      for (int i = 0; i < 3; i++)
+      {
+        sum[i] = _mm_add_epi32(x[i], x[i + 1]);
+      }
 
-      pred[0] = pred[2];
-      pred[1] = pred[3];
+      __m128i r0 = _mm_add_epi32(sum[0], sum[1]);
+      __m128i r1 = _mm_add_epi32(sum[1], sum[2]);
 
-      intermediates[1] = _mm_srli_si128(intermediates[0], 8);
-      intermediates[3] = _mm_srli_si128(intermediates[2], 8);
+      r0 = _mm_sub_epi32(_mm_srli_si128(r0, 8), r0);
+      r1 = _mm_sub_epi32(_mm_srli_si128(r1, 8), r1);
 
-      derivate[0] = _mm_sub_epi32(intermediates[1], intermediates[0]);
-      derivate[1] = _mm_sub_epi32(intermediates[3], intermediates[2]);
+      _mm_storel_epi64((__m128i*) &dst[col + row * dstStride], r0);
+      _mm_storel_epi64((__m128i*) &dst[col + (row + 1) * dstStride], r1);
 
-      _mm_storel_epi64((__m128i *)&pDerivate[col + row * derivateBufStride], derivate[0]);
-      _mm_storel_epi64((__m128i *)&pDerivate[col + (row + 1) * derivateBufStride], derivate[1]);
+      x[0] = x[2];
+      x[1] = x[3];
     }
   }
 
-  for (int j = 1; j < (height - 1); j++)
+  for (int j = 1; j < height - 1; j++)
   {
-    pDerivate[j * derivateBufStride] = pDerivate[j * derivateBufStride + 1];
-    pDerivate[j * derivateBufStride + (width - 1)] = pDerivate[j * derivateBufStride + (width - 2)];
+    dst[j * dstStride]               = dst[j * dstStride + 1];
+    dst[j * dstStride + (width - 1)] = dst[j * dstStride + (width - 2)];
   }
 
-  memcpy(pDerivate, pDerivate + derivateBufStride, width * sizeof(pDerivate[0]));
-  memcpy(pDerivate + (height - 1) * derivateBufStride, pDerivate + (height - 2) * derivateBufStride, width * sizeof(pDerivate[0])
-  );
+  sobelPadTopBottom(dst, dstStride, width, height);
 }
 
-template<X86_VEXT vext>
-static void simdVerticalSobelFilter_HBD_SIMD(Pel *const pPred, const ptrdiff_t predStride, int *const pDerivate,
-                                             const ptrdiff_t derivateBufStride, const int width, const int height)
+template<X86_VEXT vext> static void simdVerticalSobelFilter_HBD_SIMD(Pel* const src, const ptrdiff_t srcStride,
+                                                                     int* const dst, const ptrdiff_t dstStride,
+                                                                     const int width, const int height)
 {
-  __m128i pred[4];
-  __m128i intermediates[6];
-  __m128i derivate[2];
-
   assert(!(height % 2));
   assert(!(width % 4));
 
@@ -432,46 +505,34 @@ static void simdVerticalSobelFilter_HBD_SIMD(Pel *const pPred, const ptrdiff_t p
 #if USE_AVX2
   if (vext >= AVX2)
   {
-    __m256i pred256[4];
-    __m256i intermediates256[6];
-    __m256i derivate256[2];
-    __m256i shuffle256 = _mm256_set_epi32(0x00000007, 0x00000007, 0x00000006, 0x00000005, 0x00000004, 0x00000003, 0x00000002, 0x00000001);
+    const __m256i shuffle256 = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
 
     for (; (col + 6) < width; col += 6)
     {
-      pred256[0] = _mm256_lddqu_si256((__m256i *)&pPred[col - 1]);
-      pred256[1] = _mm256_lddqu_si256((__m256i *)&pPred[predStride + col - 1]);
+      __m256i x[4];
+
+      x[0] = _mm256_loadu_si256((__m256i*) &src[col - 1]);
+      x[1] = _mm256_loadu_si256((__m256i*) &src[srcStride + col - 1]);
 
       for (int row = 1; row < (height - 1); row += 2)
       {
-        pred256[2] = _mm256_lddqu_si256((__m256i *)&pPred[(row + 1) * predStride + col - 1]);
-        pred256[3] = _mm256_lddqu_si256((__m256i *)&pPred[(row + 2) * predStride + col - 1]);
-
-        intermediates256[0] = _mm256_sub_epi32(pred256[2], pred256[0]);
-        intermediates256[3] = _mm256_sub_epi32(pred256[3], pred256[1]);
+        x[2] = _mm256_loadu_si256((__m256i*) &src[(row + 1) * srcStride + col - 1]);
+        x[3] = _mm256_loadu_si256((__m256i*) &src[(row + 2) * srcStride + col - 1]);
 
-        pred256[0] = pred256[2];
-        pred256[1] = pred256[3];
+        __m256i r0 = _mm256_sub_epi32(x[2], x[0]);
+        __m256i r1 = _mm256_sub_epi32(x[3], x[1]);
 
-        intermediates256[1] = _mm256_permutevar8x32_epi32(intermediates256[0], shuffle256);
-        intermediates256[4] = _mm256_permutevar8x32_epi32(intermediates256[3], shuffle256);
-        intermediates256[2] = _mm256_permute4x64_epi64(intermediates256[0], 0xf9);
-        intermediates256[5] = _mm256_permute4x64_epi64(intermediates256[3], 0xf9);
+        r0 = _mm256_add_epi32(r0, _mm256_permutevar8x32_epi32(r0, shuffle256));
+        r1 = _mm256_add_epi32(r1, _mm256_permutevar8x32_epi32(r1, shuffle256));
 
-        intermediates256[1] = _mm256_slli_epi32(intermediates256[1], 1);
-        intermediates256[4] = _mm256_slli_epi32(intermediates256[4], 1);
+        r0 = _mm256_add_epi32(r0, _mm256_permutevar8x32_epi32(r0, shuffle256));
+        r1 = _mm256_add_epi32(r1, _mm256_permutevar8x32_epi32(r1, shuffle256));
 
-        intermediates256[0] = _mm256_add_epi32(intermediates256[0], intermediates256[2]);
-        intermediates256[3] = _mm256_add_epi32(intermediates256[3], intermediates256[5]);
+        store6x32(&dst[col + row * dstStride], r0);
+        store6x32(&dst[col + (row + 1) * dstStride], r1);
 
-        derivate256[0] = _mm256_add_epi32(intermediates256[0], intermediates256[1]);
-        derivate256[1] = _mm256_add_epi32(intermediates256[3], intermediates256[4]);
-
-        _mm_storeu_si128((__m128i *)&pDerivate[col + row * derivateBufStride], _mm256_castsi256_si128(derivate256[0]));
-        _mm_storel_epi64((__m128i *)&pDerivate[col + 4 + row * derivateBufStride], _mm256_castsi256_si128(_mm256_permute4x64_epi64(derivate256[0], 0xaa)));
-
-        _mm_storeu_si128((__m128i *)&pDerivate[col + (row + 1) * derivateBufStride], _mm256_castsi256_si128(derivate256[1]));
-        _mm_storel_epi64((__m128i *)&pDerivate[col + 4 + (row + 1) * derivateBufStride], _mm256_castsi256_si128(_mm256_permute4x64_epi64(derivate256[1], 0xaa)));
+        x[0] = x[2];
+        x[1] = x[3];
       }
     }
   }
@@ -481,286 +542,40 @@ static void simdVerticalSobelFilter_HBD_SIMD(Pel *const pPred, const ptrdiff_t p
   /* The value of col and row indicate the columns and rows for which the derivates have already been computed */
   for (; (col + 2) < width; col += 2)
   {
-    pred[0] = _mm_lddqu_si128((__m128i *)&pPred[col - 1]);
-    pred[1] = _mm_lddqu_si128((__m128i *)&pPred[predStride + col - 1]);
+    __m128i x[4];
 
-    for (int row = 1; row < (height - 1); row += 2)
-    {
-      pred[2] = _mm_lddqu_si128((__m128i *)&pPred[(row + 1) * predStride + col - 1]);
-      pred[3] = _mm_lddqu_si128((__m128i *)&pPred[(row + 2) * predStride + col - 1]);
+    x[0] = _mm_loadu_si128((__m128i*) &src[col - 1]);
+    x[1] = _mm_loadu_si128((__m128i*) &src[srcStride + col - 1]);
 
-      intermediates[0] = _mm_sub_epi32(pred[2], pred[0]);
-      intermediates[3] = _mm_sub_epi32(pred[3], pred[1]);
-
-      pred[0] = pred[2];
-      pred[1] = pred[3];
+    for (int row = 1; row < height - 1; row += 2)
+    {
+      x[2] = _mm_loadu_si128((__m128i*) &src[(row + 1) * srcStride + col - 1]);
+      x[3] = _mm_loadu_si128((__m128i*) &src[(row + 2) * srcStride + col - 1]);
 
-      intermediates[1] = _mm_srli_si128(intermediates[0], 4);
-      intermediates[4] = _mm_srli_si128(intermediates[3], 4);
-      intermediates[2] = _mm_srli_si128(intermediates[0], 8);
-      intermediates[5] = _mm_srli_si128(intermediates[3], 8);
+      __m128i r0 = _mm_sub_epi32(x[2], x[0]);
+      __m128i r1 = _mm_sub_epi32(x[3], x[1]);
 
-      intermediates[1] = _mm_slli_epi32(intermediates[1], 1);
-      intermediates[4] = _mm_slli_epi32(intermediates[4], 1);
+      r0 = _mm_add_epi32(r0, _mm_srli_si128(r0, 4));
+      r1 = _mm_add_epi32(r1, _mm_srli_si128(r1, 4));
 
-      intermediates[0] = _mm_add_epi32(intermediates[0], intermediates[2]);
-      intermediates[3] = _mm_add_epi32(intermediates[3], intermediates[5]);
+      r0 = _mm_add_epi32(r0, _mm_srli_si128(r0, 4));
+      r1 = _mm_add_epi32(r1, _mm_srli_si128(r1, 4));
 
-      derivate[0] = _mm_add_epi32(intermediates[0], intermediates[1]);
-      derivate[1] = _mm_add_epi32(intermediates[3], intermediates[4]);
+      _mm_storel_epi64((__m128i*) &dst[col + row * dstStride], r0);
+      _mm_storel_epi64((__m128i*) &dst[col + (row + 1) * dstStride], r1);
 
-      _mm_storel_epi64((__m128i *)&pDerivate[col + row * derivateBufStride], derivate[0]);
-      _mm_storel_epi64((__m128i *)&pDerivate[col + (row + 1) * derivateBufStride], derivate[1]);
+      x[0] = x[2];
+      x[1] = x[3];
     }
   }
 
-  for (int j = 1; j < (height - 1); j++)
+  for (int j = 1; j < height - 1; j++)
   {
-    pDerivate[j * derivateBufStride] = pDerivate[j * derivateBufStride + 1];
-    pDerivate[j * derivateBufStride + (width - 1)] = pDerivate[j * derivateBufStride + (width - 2)];
-  }
-
-  memcpy(pDerivate, pDerivate + derivateBufStride, width * sizeof(pDerivate[0]));
-  memcpy(pDerivate + (height - 1) * derivateBufStride, pDerivate + (height - 2) * derivateBufStride, width * sizeof(pDerivate[0]));
-}
-
-#define CALC_EQUAL_COEFF_16PXLS(x1,x2,y1,y2,tmp0,tmp1,tmp2,tmp3,inter0,inter1,inter2,inter3,loadLocation)         \
-{                                                                                                                 \
-inter0 = _mm256_mul_epi32(x1, y1);                                                                                \
-inter1 = _mm256_mul_epi32(tmp0, tmp2);                                                                            \
-inter2 = _mm256_mul_epi32(x2, y2);                                                                                \
-inter3 = _mm256_mul_epi32(tmp1, tmp3);                                                                            \
-inter2 = _mm256_add_epi64(inter0, inter2);                                                                        \
-inter3 = _mm256_add_epi64(inter1, inter3);                                                                        \
-inter0 = _mm256_castsi128_si256(_mm_loadl_epi64(loadLocation));                                                   \
-inter3 = _mm256_add_epi64(inter2, inter3);                                                                        \
-inter1 = _mm256_permute4x64_epi64(inter3, 0x4e);                                                                  \
-inter3 = _mm256_add_epi64(inter1, inter3);                                                                        \
-inter1 = _mm256_permute4x64_epi64(inter3, 0xb1);                                                                  \
-inter3 = _mm256_add_epi64(inter1, inter3);                                                                        \
-inter3 = _mm256_add_epi64(inter0, inter3);                                                                        \
-}
-
-template<X86_VEXT vext>
-static void simdEqualCoeffComputer_HBD_SIMD(Pel *pResidue, ptrdiff_t residueStride, int **ppDerivate,
-                                            ptrdiff_t derivateBufStride, int64_t (*pEqualCoeff)[7], int width,
-                                            int height, bool b6Param)
-{
-  int n = b6Param ? 6 : 4;
-  CHECK((width & 8), "width of affine block should be multiple of 8");
-
-#if USE_AVX2
-  if (vext >= AVX2)
-  {
-    ptrdiff_t idx1 = -2 * derivateBufStride - 8;
-    ptrdiff_t idx2 = -derivateBufStride - 8;
-
-    __m256i tmp[4];
-    __m256i intermediate[4];
-    __m256i residue[2];
-    __m256i coef[12];
-
-    // Add directly to indexes to get new index
-    __m256i four = _mm256_set1_epi32(4);
-    __m256i eight = _mm256_set1_epi32(8);
-    __m256i shuffle = _mm256_set_epi32(0x00000007, 0x00000007, 0x00000006, 0x00000005, 0x00000004, 0x00000003, 0x00000002, 0x00000001);
-    __m256i indxJ = _mm256_set1_epi32(-2);
-
-    for (int j = 0; j < height; j += 2)
-    {
-      if (!(j & 3))
-        indxJ = _mm256_add_epi32(indxJ, four);
-      __m256i indxK = _mm256_set_epi32(-2, -2, -2, -2, -6, -6, -6, -6);
-      idx1 += (derivateBufStride << 1);
-      idx2 += (derivateBufStride << 1);
-
-      for (int k = 0; k < width; k += 8)
-      {
-        idx1 += 8;
-        idx2 += 8;
-        indxK = _mm256_add_epi32(indxK, eight);
-
-        if (b6Param)
-        {
-          // coef[0-5] for iC[0-5] of 1st row of pixels
-          coef[0] = _mm256_lddqu_si256((__m256i *)&ppDerivate[0][idx1]);
-          coef[2] = _mm256_lddqu_si256((__m256i *)&ppDerivate[1][idx1]);
-          coef[1] = _mm256_mullo_epi32(indxK, coef[0]);
-          coef[3] = _mm256_mullo_epi32(indxK, coef[2]);
-          coef[4] = _mm256_mullo_epi32(indxJ, coef[0]);
-          coef[5] = _mm256_mullo_epi32(indxJ, coef[2]);
-
-          // coef[6-11] for iC[0-5] of 2nd row of pixels
-          coef[6] = _mm256_lddqu_si256((__m256i *)&ppDerivate[0][idx2]);
-          coef[8] = _mm256_lddqu_si256((__m256i *)&ppDerivate[1][idx2]);
-          coef[7] = _mm256_mullo_epi32(indxK, coef[6]);
-          coef[9] = _mm256_mullo_epi32(indxK, coef[8]);
-          coef[10] = _mm256_mullo_epi32(indxJ, coef[6]);
-          coef[11] = _mm256_mullo_epi32(indxJ, coef[8]);
-        }
-        else
-        {
-          // coef[0-3] for iC[0-3] of 1st row of pixels
-          coef[0] = _mm256_lddqu_si256((__m256i *)&ppDerivate[0][idx1]);
-          coef[2] = _mm256_lddqu_si256((__m256i *)&ppDerivate[1][idx1]);
-          coef[1] = _mm256_mullo_epi32(indxK, coef[0]);
-          coef[3] = _mm256_mullo_epi32(indxJ, coef[0]);
-          tmp[0] = _mm256_mullo_epi32(indxJ, coef[2]);
-          tmp[1] = _mm256_mullo_epi32(indxK, coef[2]);
-          coef[1] = _mm256_add_epi32(coef[1], tmp[0]);
-          coef[3] = _mm256_sub_epi32(coef[3], tmp[1]);
-
-          // coef[4-7] for iC[0-3] of 1st row of pixels
-          coef[4] = _mm256_lddqu_si256((__m256i *)&ppDerivate[0][idx2]);
-          coef[6] = _mm256_lddqu_si256((__m256i *)&ppDerivate[1][idx2]);
-          coef[5] = _mm256_mullo_epi32(indxK, coef[4]);
-          coef[7] = _mm256_mullo_epi32(indxJ, coef[4]);
-          tmp[2] = _mm256_mullo_epi32(indxJ, coef[6]);
-          tmp[3] = _mm256_mullo_epi32(indxK, coef[6]);
-          coef[5] = _mm256_add_epi32(coef[5], tmp[2]);
-          coef[7] = _mm256_sub_epi32(coef[7], tmp[3]);
-        }
-
-        // Residue
-        residue[0] = _mm256_lddqu_si256((__m256i *)&pResidue[idx1]);
-        residue[1] = _mm256_lddqu_si256((__m256i *)&pResidue[idx2]);
-        residue[0] = _mm256_slli_epi32(residue[0], 3);
-        residue[1] = _mm256_slli_epi32(residue[1], 3);
-
-        // Calculation of coefficient matrix
-        for (int col = 0; col < n; col++)
-        {
-          tmp[0] = _mm256_permutevar8x32_epi32(coef[0 + col], shuffle);
-          tmp[1] = _mm256_permutevar8x32_epi32(coef[n + col], shuffle);
-          CALC_EQUAL_COEFF_16PXLS(coef[0 + col], coef[n + col], coef[0 + col], coef[n + col], tmp[0], tmp[1], tmp[0], tmp[1], intermediate[0], intermediate[1], intermediate[2], intermediate[3], (const __m128i*)&pEqualCoeff[col + 1][col]);
-          _mm_storel_epi64((__m128i*)&pEqualCoeff[col + 1][col], _mm256_castsi256_si128(intermediate[3]));
-
-          for (int row = col + 1; row < n; row++)
-          {
-            tmp[2] = _mm256_permutevar8x32_epi32(coef[0 + row], shuffle);
-            tmp[3] = _mm256_permutevar8x32_epi32(coef[n + row], shuffle);
-            CALC_EQUAL_COEFF_16PXLS(coef[0 + col], coef[n + col], coef[0 + row], coef[n + row], tmp[0], tmp[1], tmp[2], tmp[3], intermediate[0], intermediate[1], intermediate[2], intermediate[3], (const __m128i*)&pEqualCoeff[col + 1][row]);
-            _mm_storel_epi64((__m128i*)&pEqualCoeff[col + 1][row], _mm256_castsi256_si128(intermediate[3]));
-            _mm_storel_epi64((__m128i*)&pEqualCoeff[row + 1][col], _mm256_castsi256_si128(intermediate[3]));
-          }
-
-          tmp[2] = _mm256_permutevar8x32_epi32(residue[0], shuffle);
-          tmp[3] = _mm256_permutevar8x32_epi32(residue[1], shuffle);
-          CALC_EQUAL_COEFF_16PXLS(coef[0 + col], coef[n + col], residue[0], residue[1], tmp[0], tmp[1], tmp[2], tmp[3], intermediate[0], intermediate[1], intermediate[2], intermediate[3], (const __m128i*)&pEqualCoeff[col + 1][n]);
-          _mm_storel_epi64((__m128i*)&pEqualCoeff[col + 1][n], _mm256_castsi256_si128(intermediate[3]));
-        }
-      }
-
-      idx1 -= (width);
-      idx2 -= (width);
-    }
+    dst[j * dstStride]               = dst[j * dstStride + 1];
+    dst[j * dstStride + (width - 1)] = dst[j * dstStride + (width - 2)];
   }
-  else
-#endif
-  {
-    ptrdiff_t idx1 = -2 * derivateBufStride - 4;
-    ptrdiff_t idx2 = -derivateBufStride - 4;
-
-    __m128i four;
-    __m128i tmp[4];
-    __m128i intermediate[4];
-    __m128i indxK, indxJ;
-    __m128i residue[2];
-    __m128i coef[12];
 
-    // Add directly to indexes to get new index
-    four = _mm_set1_epi32(4);
-    indxJ = _mm_set1_epi32(-2);
-
-    for (int j = 0; j < height; j += 2)
-    {
-      if (!(j & 3))
-        indxJ = _mm_add_epi32(indxJ, four);
-      indxK = _mm_set1_epi32(-2);
-      idx1 += (derivateBufStride << 1);
-      idx2 += (derivateBufStride << 1);
-
-      for (int k = 0; k < width; k += 4)
-      {
-        idx1 += 4;
-        idx2 += 4;
-        indxK = _mm_add_epi32(indxK, four);
-
-        if (b6Param)
-        {
-          // coef[0-5] for iC[0-5] of 1st row of pixels
-          coef[0] = _mm_lddqu_si128((const __m128i*)&ppDerivate[0][idx1]);
-          coef[2] = _mm_lddqu_si128((const __m128i*)&ppDerivate[1][idx1]);
-          coef[1] = _mm_mullo_epi32(indxK, coef[0]);
-          coef[3] = _mm_mullo_epi32(indxK, coef[2]);
-          coef[4] = _mm_mullo_epi32(indxJ, coef[0]);
-          coef[5] = _mm_mullo_epi32(indxJ, coef[2]);
-
-          // coef[6-11] for iC[0-5] of 2nd row of pixels
-          coef[6] = _mm_lddqu_si128((const __m128i*)&ppDerivate[0][idx2]);
-          coef[8] = _mm_lddqu_si128((const __m128i*)&ppDerivate[1][idx2]);
-          coef[7] = _mm_mullo_epi32(indxK, coef[6]);
-          coef[9] = _mm_mullo_epi32(indxK, coef[8]);
-          coef[10] = _mm_mullo_epi32(indxJ, coef[6]);
-          coef[11] = _mm_mullo_epi32(indxJ, coef[8]);
-        }
-        else
-        {
-          // coef[0-3] for iC[0-3] of 1st row of pixels
-          coef[0] = _mm_lddqu_si128((const __m128i*)&ppDerivate[0][idx1]);
-          coef[2] = _mm_lddqu_si128((const __m128i*)&ppDerivate[1][idx1]);
-          coef[1] = _mm_mullo_epi32(indxK, coef[0]);
-          coef[3] = _mm_mullo_epi32(indxJ, coef[0]);
-          tmp[0] = _mm_mullo_epi32(indxJ, coef[2]);
-          tmp[1] = _mm_mullo_epi32(indxK, coef[2]);
-          coef[1] = _mm_add_epi32(coef[1], tmp[0]);
-          coef[3] = _mm_sub_epi32(coef[3], tmp[1]);
-
-          // coef[4-7] for iC[0-3] of 1st row of pixels
-          coef[4] = _mm_lddqu_si128((const __m128i*)&ppDerivate[0][idx2]);
-          coef[6] = _mm_lddqu_si128((const __m128i*)&ppDerivate[1][idx2]);
-          coef[5] = _mm_mullo_epi32(indxK, coef[4]);
-          coef[7] = _mm_mullo_epi32(indxJ, coef[4]);
-          tmp[2] = _mm_mullo_epi32(indxJ, coef[6]);
-          tmp[3] = _mm_mullo_epi32(indxK, coef[6]);
-          coef[5] = _mm_add_epi32(coef[5], tmp[2]);
-          coef[7] = _mm_sub_epi32(coef[7], tmp[3]);
-        }
-
-        // Residue
-        residue[0] = _mm_lddqu_si128((__m128i *)&pResidue[idx1]);
-        residue[1] = _mm_lddqu_si128((__m128i *)&pResidue[idx2]);
-        residue[0] = _mm_slli_epi32(residue[0], 3);
-        residue[1] = _mm_slli_epi32(residue[1], 3);
-
-        // Calculation of coefficient matrix
-        for (int col = 0; col < n; col++)
-        {
-          tmp[0] = _mm_srli_si128(coef[0 + col], 4);
-          tmp[1] = _mm_srli_si128(coef[n + col], 4);
-          CALC_EQUAL_COEFF_8PXLS(coef[0 + col], coef[n + col], coef[0 + col], coef[n + col], tmp[0], tmp[1], tmp[0], tmp[1], intermediate[0], intermediate[1], intermediate[2], intermediate[3], (const __m128i*)&pEqualCoeff[col + 1][col]);
-          _mm_storel_epi64((__m128i*)&pEqualCoeff[col + 1][col], intermediate[3]);
-
-          for (int row = col + 1; row < n; row++)
-          {
-            tmp[2] = _mm_srli_si128(coef[0 + row], 4);
-            tmp[3] = _mm_srli_si128(coef[n + row], 4);
-            CALC_EQUAL_COEFF_8PXLS(coef[0 + col], coef[n + col], coef[0 + row], coef[n + row], tmp[0], tmp[1], tmp[2], tmp[3], intermediate[0], intermediate[1], intermediate[2], intermediate[3], (const __m128i*)&pEqualCoeff[col + 1][row]);
-            _mm_storel_epi64((__m128i*)&pEqualCoeff[col + 1][row], intermediate[3]);
-            _mm_storel_epi64((__m128i*)&pEqualCoeff[row + 1][col], intermediate[3]);
-          }
-
-          tmp[2] = _mm_srli_si128(residue[0], 4);
-          tmp[3] = _mm_srli_si128(residue[1], 4);
-          CALC_EQUAL_COEFF_8PXLS(coef[0 + col], coef[n + col], residue[0], residue[1], tmp[0], tmp[1], tmp[2], tmp[3], intermediate[0], intermediate[1], intermediate[2], intermediate[3], (const __m128i*)&pEqualCoeff[col + 1][n]);
-          _mm_storel_epi64((__m128i*)&pEqualCoeff[col + 1][n], intermediate[3]);
-        }
-      }
-
-      idx1 -= (width);
-      idx2 -= (width);
-    }
-  }
+  sobelPadTopBottom(dst, dstStride, width, height);
 }
 #endif
 
@@ -769,16 +584,14 @@ void AffineGradientSearch::_initAffineGradientSearchX86()
 {
 #if RExt__HIGH_BIT_DEPTH_SUPPORT
   m_HorizontalSobelFilter = simdHorizontalSobelFilter_HBD_SIMD<vext>;
-  m_VerticalSobelFilter = simdVerticalSobelFilter_HBD_SIMD<vext>;
-  m_EqualCoeffComputer = simdEqualCoeffComputer_HBD_SIMD<vext>;
+  m_VerticalSobelFilter   = simdVerticalSobelFilter_HBD_SIMD<vext>;
 #else
   m_HorizontalSobelFilter = simdHorizontalSobelFilter<vext>;
-  m_VerticalSobelFilter = simdVerticalSobelFilter<vext>;
-  m_EqualCoeffComputer = simdEqualCoeffComputer<vext>;
+  m_VerticalSobelFilter   = simdVerticalSobelFilter<vext>;
 #endif
+  m_EqualCoeffComputer = simdEqualCoeffComputer<vext>;
 }
 
 template void AffineGradientSearch::_initAffineGradientSearchX86<SIMDX86>();
 
 #endif //#ifdef TARGET_SIMD_X86
-//! \}
-- 
GitLab