diff --git a/source/Lib/CommonLib/x86/BufferX86.h b/source/Lib/CommonLib/x86/BufferX86.h
index edf9cac3cde7c7dc6399c222202d71e11dbcc2e5..20b0b254ff3be3e9a7126d3011891745f4dcf9a3 100644
--- a/source/Lib/CommonLib/x86/BufferX86.h
+++ b/source/Lib/CommonLib/x86/BufferX86.h
@@ -538,7 +538,7 @@ template< X86_VEXT vext >
 void avg_SSE( const int16_t* src1, int src1Stride, const int16_t* src2, int src2Stride, int16_t *dst, int dstStride, int width, int height)
 {
 #ifdef USE_AVX2
-  if( !( width % 16 ) )
+  if( !( width & 15 ) )
   {
     for( int y = 0; y < height; y++ )
     {
@@ -557,7 +557,7 @@ void avg_SSE( const int16_t* src1, int src1Stride, const int16_t* src2, int src2
   else
 #endif
   {
-    if( !( width % 8 ) )
+    if( !( width & 7 ) )
     {
       for( int y = 0; y < height; y++ )
       {
@@ -573,7 +573,7 @@ void avg_SSE( const int16_t* src1, int src1Stride, const int16_t* src2, int src2
         dst += dstStride;
       }
     }
-    else if( !( width % 4 ) )
+    else if( !( width & 3 ) )
     {
       for( int y = 0; y < height; y++ )
       {
@@ -628,8 +628,11 @@ void copyBufferSimd(Pel *src, int srcStride, Pel *dst, int dstStride, int width,
   {
     for (size_t x = 0; x < width; x += 8)
     {
-      if (x > width - 8)
+      if( x > width - 8 )
+      {
         x = width - 8;
+      }
+
       for (size_t y = 0; y < height; y++)
       {
         __m128i val = _mm_loadu_si128((const __m128i *) (src + y * srcStride + x));
@@ -2762,82 +2765,83 @@ int64_t getSumOfDifference_SSE(const Pel* src0, int src0Stride, const Pel* src1,
 
   // internal bit-depth must be 12-bit or lower
 
-  if (width & 7) // multiple of 4
+#ifdef USE_AVX2
+  if( vext >= AVX2 && !( width & 15 ) ) // multiple of 16
   {
-    __m128i vzero = _mm_setzero_si128();
-    __m128i vsum32 = vzero;
+    __m256i vzero = _mm256_setzero_si256();
+    __m256i vsum32 = vzero;
 
-    for (; height != 0; height -= subStep)
+    for( int m = 0; m < height; m += subStep )
     {
-      __m128i vsum16 = vzero;
+      __m256i vsum16 = vzero;
 
-      for (int n = 0; n < width; n += 4)
+      for( int n = 0; n < width; n += 16 )
       {
-        __m128i org = _mm_loadl_epi64((__m128i*)(pOrg + n));
-        __m128i cur = _mm_loadl_epi64((__m128i*)(pCur + n));
-        vsum16 = _mm_adds_epi16(vsum16, _mm_sub_epi16(org, cur));
+        __m256i org = _mm256_lddqu_si256( ( __m256i* )( pOrg + n ) );
+        __m256i cur = _mm256_lddqu_si256( ( __m256i* )( pCur + n ) );
+        vsum16 = _mm256_adds_epi16( vsum16, _mm256_sub_epi16( org, cur ) );
       }
 
-      __m128i vsign = _mm_cmpgt_epi16(vzero, vsum16);
-      vsum32 = _mm_add_epi32(vsum32, _mm_unpacklo_epi16(vsum16, vsign));
+      __m256i vsign = _mm256_cmpgt_epi16( vzero, vsum16 );
+      __m256i vsumtemp = _mm256_add_epi32( _mm256_unpacklo_epi16( vsum16, vsign ), _mm256_unpackhi_epi16( vsum16, vsign ) );
+      vsum32 = _mm256_add_epi32( vsum32, vsumtemp );
 
       pOrg += strideOrg;
       pCur += strideCur;
     }
 
-    vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0x4e));   // 01001110
-    vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0xb1));   // 10110001
-    deltaAvg = _mm_cvtsi128_si32(vsum32);
+    vsum32 = _mm256_hadd_epi32( vsum32, vzero );
+    vsum32 = _mm256_hadd_epi32( vsum32, vzero );
+    deltaAvg = _mm_cvtsi128_si32( _mm256_castsi256_si128( vsum32 ) ) + _mm_cvtsi128_si32( _mm256_castsi256_si128( _mm256_permute2x128_si256( vsum32, vsum32, 0x11 ) ) );
   }
-#ifdef USE_AVX2
-  else if (vext >= AVX2 && width >= 16) // multiple of 16
+  else
+#endif
+  if( !( width & 7 ) ) // multiple of 8
   {
-    __m256i vzero = _mm256_setzero_si256();
-    __m256i vsum32 = vzero;
+    __m128i vzero = _mm_setzero_si128();
+    __m128i vsum32 = vzero;
 
-    for (; height != 0; height -= subStep)
+    for( int m = 0; m < height; m += subStep )
     {
-      __m256i vsum16 = vzero;
+      __m128i vsum16 = vzero;
 
-      for (int n = 0; n < width; n += 16)
+      for( int n = 0; n < width; n += 8 )
       {
-        __m256i org = _mm256_lddqu_si256((__m256i*)(pOrg + n));
-        __m256i cur = _mm256_lddqu_si256((__m256i*)(pCur + n));
-        vsum16 = _mm256_adds_epi16(vsum16, _mm256_sub_epi16(org, cur));
+        __m128i org = _mm_lddqu_si128( ( __m128i* )( pOrg + n ) );
+        __m128i cur = _mm_lddqu_si128( ( __m128i* )( pCur + n ) );
+        vsum16 = _mm_adds_epi16( vsum16, _mm_sub_epi16( org, cur ) );
       }
 
-      __m256i vsign = _mm256_cmpgt_epi16(vzero, vsum16);
-      __m256i vsumtemp = _mm256_add_epi32(_mm256_unpacklo_epi16(vsum16, vsign), _mm256_unpackhi_epi16(vsum16, vsign));
-      vsum32 = _mm256_add_epi32(vsum32, vsumtemp);
+      __m128i vsign = _mm_cmpgt_epi16( vzero, vsum16 );
+      __m128i vsumtemp = _mm_add_epi32( _mm_unpacklo_epi16( vsum16, vsign ), _mm_unpackhi_epi16( vsum16, vsign ) );
+      vsum32 = _mm_add_epi32( vsum32, vsumtemp );
 
       pOrg += strideOrg;
       pCur += strideCur;
     }
 
-    vsum32 = _mm256_hadd_epi32(vsum32, vzero);
-    vsum32 = _mm256_hadd_epi32(vsum32, vzero);
-    deltaAvg = _mm_cvtsi128_si32(_mm256_castsi256_si128(vsum32)) + _mm_cvtsi128_si32(_mm256_castsi256_si128(_mm256_permute2x128_si256(vsum32, vsum32, 0x11)));
+    vsum32 = _mm_add_epi32( vsum32, _mm_shuffle_epi32( vsum32, 0x4e ) );   // 01001110
+    vsum32 = _mm_add_epi32( vsum32, _mm_shuffle_epi32( vsum32, 0xb1 ) );   // 10110001
+    deltaAvg = _mm_cvtsi128_si32( vsum32 );
   }
-#endif
-  else // multiple of 8
+  else if( !( width & 3 ) ) // multiple of 4
   {
     __m128i vzero = _mm_setzero_si128();
     __m128i vsum32 = vzero;
 
-    for (; height != 0; height -= subStep)
+    for( int m = 0; m < height; m += subStep )
     {
       __m128i vsum16 = vzero;
 
-      for (int n = 0; n < width; n += 8)
+      for (int n = 0; n < width; n += 4)
       {
-        __m128i org = _mm_lddqu_si128((__m128i*)(pOrg + n));
-        __m128i cur = _mm_lddqu_si128((__m128i*)(pCur + n));
+        __m128i org = _mm_loadl_epi64((__m128i*)(pOrg + n));
+        __m128i cur = _mm_loadl_epi64((__m128i*)(pCur + n));
         vsum16 = _mm_adds_epi16(vsum16, _mm_sub_epi16(org, cur));
       }
 
       __m128i vsign = _mm_cmpgt_epi16(vzero, vsum16);
-      __m128i vsumtemp = _mm_add_epi32(_mm_unpacklo_epi16(vsum16, vsign), _mm_unpackhi_epi16(vsum16, vsign));
-      vsum32 = _mm_add_epi32(vsum32, vsumtemp);
+      vsum32 = _mm_add_epi32(vsum32, _mm_unpacklo_epi16(vsum16, vsign));
 
       pOrg += strideOrg;
       pCur += strideCur;
@@ -2847,6 +2851,10 @@ int64_t getSumOfDifference_SSE(const Pel* src0, int src0Stride, const Pel* src1,
     vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0xb1));   // 10110001
     deltaAvg = _mm_cvtsi128_si32(vsum32);
   }
+  else
+  {
+    return getSumOfDifferenceCore( src0, src0Stride, src1, src1Stride, width, height, rowSubShift, bitDepth );
+  }
 
   deltaAvg <<= subShift;
   return deltaAvg;
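
Note (not part of the patch): the new SSE branches above fold their 32-bit accumulator with the 0x4e/0xb1 shuffle pair before extracting the scalar result. A minimal standalone sketch of that reduction, assuming SSE2 only; the helper name is hypothetical:

  #include <emmintrin.h>   // SSE2 intrinsics

  // Horizontal sum of the four 32-bit lanes of v.
  static inline int horizontalSum32( __m128i v )
  {
    v = _mm_add_epi32( v, _mm_shuffle_epi32( v, 0x4e ) );   // 01001110: swap the two 64-bit halves
    v = _mm_add_epi32( v, _mm_shuffle_epi32( v, 0xb1 ) );   // 10110001: swap adjacent 32-bit lanes
    return _mm_cvtsi128_si32( v );                          // every lane now holds the total
  }

The AVX2 branches perform the same reduction with two _mm256_hadd_epi32 passes and then add the partial sums extracted from the low and high 128-bit halves (via _mm256_permute2x128_si256).
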
diff --git a/source/Lib/CommonLib/x86/RdCostX86.h b/source/Lib/CommonLib/x86/RdCostX86.h
index 62ba542a229c20ebaa250f269c7a9a83896a566e..088b60a8829b849b3bb7a0e7ccac4aad061d9e6b 100644
--- a/source/Lib/CommonLib/x86/RdCostX86.h
+++ b/source/Lib/CommonLib/x86/RdCostX86.h
@@ -2678,184 +2678,191 @@ Distortion RdCost::xGetMRSAD_SIMD(const DistParam &rcDtParam)
   uint32_t sum = 0;
 
   // internal bit-depth must be 12-bit or lower
-
-  if (width & 7) // multiple of 4
+#ifdef USE_AVX2
+  if( vext >= AVX2 && !( width & 15 ) ) // multiple of 16
   {
-    __m128i vzero = _mm_setzero_si128();
-    __m128i vsum32 = vzero;
+    __m256i vzero = _mm256_setzero_si256();
+    __m256i vsum32 = vzero;
 
-    for (; height != 0; height -= subStep)
+    for( int m = 0; m < height; m += subStep )
     {
-      __m128i vsum16 = vzero;
+      __m256i vsum16 = vzero;
 
-      for (int n = 0; n < width; n += 4)
+      for( int n = 0; n < width; n += 16 )
       {
-        __m128i org = _mm_loadl_epi64((__m128i*)(pOrg + n));
-        __m128i cur = _mm_loadl_epi64((__m128i*)(pCur + n));
-        vsum16 = _mm_adds_epi16(vsum16, _mm_sub_epi16(org, cur));
+        __m256i org = _mm256_lddqu_si256( ( __m256i* )( pOrg + n ) );
+        __m256i cur = _mm256_lddqu_si256( ( __m256i* )( pCur + n ) );
+        vsum16 = _mm256_adds_epi16( vsum16, _mm256_sub_epi16( org, cur ) );
       }
 
-      __m128i vsign = _mm_cmpgt_epi16(vzero, vsum16);
-      vsum32 = _mm_add_epi32(vsum32, _mm_unpacklo_epi16(vsum16, vsign));
+      __m256i vsign = _mm256_cmpgt_epi16( vzero, vsum16 );
+      __m256i vsumtemp = _mm256_add_epi32( _mm256_unpacklo_epi16( vsum16, vsign ), _mm256_unpackhi_epi16( vsum16, vsign ) );
+      vsum32 = _mm256_add_epi32( vsum32, vsumtemp );
 
       pOrg += strideOrg;
       pCur += strideCur;
       rowCnt++;
     }
 
-    vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0x4e));   // 01001110
-    vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0xb1));   // 10110001
-    deltaAvg = _mm_cvtsi128_si32(vsum32) / (width * rowCnt);
+    vsum32 = _mm256_hadd_epi32( vsum32, vzero );
+    vsum32 = _mm256_hadd_epi32( vsum32, vzero );
+    deltaAvg = _mm_cvtsi128_si32( _mm256_castsi256_si128( vsum32 ) ) + _mm_cvtsi128_si32( _mm256_castsi256_si128( _mm256_permute2x128_si256( vsum32, vsum32, 0x11 ) ) );
+    deltaAvg /= width * rowCnt;
 
     pOrg = (const short*)rcDtParam.org.buf;
     pCur = (const short*)rcDtParam.cur.buf;
-    height = rcDtParam.org.height;
 
-    __m128i delta = _mm_set1_epi16(deltaAvg);
+    __m256i delta = _mm256_set1_epi16( deltaAvg );
     vsum32 = vzero;
 
-    for (; height != 0; height -= subStep)
+    for( int m = 0; m < height; m += subStep )
     {
-      __m128i vsum16 = vzero;
+      __m256i vsum16 = vzero;
 
-      for (int n = 0; n < width; n += 4)
+      for( int n = 0; n < width; n += 16 )
       {
-        __m128i org = _mm_loadl_epi64((__m128i*)(pOrg + n));
-        __m128i cur = _mm_loadl_epi64((__m128i*)(pCur + n));
-        __m128i abs = _mm_abs_epi16(_mm_sub_epi16(_mm_sub_epi16(org, cur), delta));
-        vsum16 = _mm_adds_epu16(abs, vsum16);
+        __m256i org = _mm256_lddqu_si256( ( __m256i* )( pOrg + n ) );
+        __m256i cur = _mm256_lddqu_si256( ( __m256i* )( pCur + n ) );
+        __m256i abs = _mm256_abs_epi16( _mm256_sub_epi16( _mm256_sub_epi16( org, cur ), delta ) );
+        vsum16 = _mm256_adds_epi16( abs, vsum16 );
       }
 
-      vsum32 = _mm_add_epi32(vsum32, _mm_unpacklo_epi16(vsum16, vzero));
+      __m256i vsumtemp = _mm256_add_epi32( _mm256_unpacklo_epi16( vsum16, vzero ), _mm256_unpackhi_epi16( vsum16, vzero ) );
+      vsum32 = _mm256_add_epi32( vsum32, vsumtemp );
 
       pOrg += strideOrg;
       pCur += strideCur;
     }
 
-    vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0x4e));   // 01001110
-    vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0xb1));   // 10110001
-    sum = _mm_cvtsi128_si32(vsum32);
+    vsum32 = _mm256_hadd_epi32( vsum32, vzero );
+    vsum32 = _mm256_hadd_epi32( vsum32, vzero );
+    sum = _mm_cvtsi128_si32( _mm256_castsi256_si128( vsum32 ) ) + _mm_cvtsi128_si32( _mm256_castsi256_si128( _mm256_permute2x128_si256( vsum32, vsum32, 0x11 ) ) );
   }
-#ifdef USE_AVX2
-  else if (vext >= AVX2 && width >= 16) // multiple of 16
+  else
+#endif
+  if( !( width & 7 ) ) // multiple of 8
   {
-    __m256i vzero = _mm256_setzero_si256();
-    __m256i vsum32 = vzero;
+    __m128i vzero = _mm_setzero_si128();
+    __m128i vsum32 = vzero;
+
+    int num = width >= 128 ? 4 : ( width >= 64 ? 2 : 1 );
+    int size = width / num;
 
-    for (; height != 0; height -= subStep)
+    for( int m = 0; m < height; m += subStep )
     {
-      __m256i vsum16 = vzero;
-
-      for (int n = 0; n < width; n += 16)
+      for( int i = 0; i < num; i++ )
       {
-        __m256i org = _mm256_lddqu_si256((__m256i*)(pOrg + n));
-        __m256i cur = _mm256_lddqu_si256((__m256i*)(pCur + n));
-        vsum16 = _mm256_adds_epi16(vsum16, _mm256_sub_epi16(org, cur));
-      }
+        __m128i vsum16 = vzero;
+
+        for( int n = i * size; n < ( i + 1 ) * size; n += 8 )
+        {
+          __m128i org = _mm_lddqu_si128( ( __m128i* )( pOrg + n ) );
+          __m128i cur = _mm_lddqu_si128( ( __m128i* )( pCur + n ) );
+          vsum16 = _mm_adds_epi16( vsum16, _mm_sub_epi16( org, cur ) );
+        }
 
-      __m256i vsign = _mm256_cmpgt_epi16(vzero, vsum16);
-      __m256i vsumtemp = _mm256_add_epi32(_mm256_unpacklo_epi16(vsum16, vsign), _mm256_unpackhi_epi16(vsum16, vsign));
-      vsum32 = _mm256_add_epi32(vsum32, vsumtemp);
+        __m128i vsign = _mm_cmpgt_epi16( vzero, vsum16 );
+        __m128i vsumtemp = _mm_add_epi32( _mm_unpacklo_epi16( vsum16, vsign ), _mm_unpackhi_epi16( vsum16, vsign ) );
+        vsum32 = _mm_add_epi32( vsum32, vsumtemp );
+      }
 
       pOrg += strideOrg;
       pCur += strideCur;
       rowCnt++;
     }
 
-    vsum32 = _mm256_hadd_epi32(vsum32, vzero);
-    vsum32 = _mm256_hadd_epi32(vsum32, vzero);
-    deltaAvg = _mm_cvtsi128_si32(_mm256_castsi256_si128(vsum32)) + _mm_cvtsi128_si32(_mm256_castsi256_si128(_mm256_permute2x128_si256(vsum32, vsum32, 0x11)));
-    deltaAvg /= width * rowCnt;
+    vsum32 = _mm_add_epi32( vsum32, _mm_shuffle_epi32( vsum32, 0x4e ) );   // 01001110
+    vsum32 = _mm_add_epi32( vsum32, _mm_shuffle_epi32( vsum32, 0xb1 ) );   // 10110001
+    deltaAvg = _mm_cvtsi128_si32( vsum32 ) / ( width * rowCnt );
 
     pOrg = (const short*)rcDtParam.org.buf;
     pCur = (const short*)rcDtParam.cur.buf;
-    height = rcDtParam.org.height;
 
-    __m256i delta = _mm256_set1_epi16(deltaAvg);
+    __m128i delta = _mm_set1_epi16( deltaAvg );
     vsum32 = vzero;
 
-    for (; height != 0; height -= subStep)
+    for( int m = 0; m < height; m += subStep )
     {
-      __m256i vsum16 = vzero;
+      __m128i vsum16 = vzero;
 
-      for (int n = 0; n < width; n += 16)
+      for( int n = 0; n < width; n += 8 )
       {
-        __m256i org = _mm256_lddqu_si256((__m256i*)(pOrg + n));
-        __m256i cur = _mm256_lddqu_si256((__m256i*)(pCur + n));
-        __m256i abs = _mm256_abs_epi16(_mm256_sub_epi16(_mm256_sub_epi16(org, cur), delta));
-        vsum16 = _mm256_adds_epi16(abs, vsum16);
+        __m128i org = _mm_lddqu_si128( ( __m128i* )( pOrg + n ) );
+        __m128i cur = _mm_lddqu_si128( ( __m128i* )( pCur + n ) );
+        __m128i abs = _mm_abs_epi16( _mm_sub_epi16( _mm_sub_epi16( org, cur ), delta ) );
+        vsum16 = _mm_adds_epu16( abs, vsum16 );
       }
 
-      __m256i vsumtemp = _mm256_add_epi32(_mm256_unpacklo_epi16(vsum16, vzero), _mm256_unpackhi_epi16(vsum16, vzero));
-      vsum32 = _mm256_add_epi32(vsum32, vsumtemp);
+      __m128i vsumtemp = _mm_add_epi32( _mm_unpacklo_epi16( vsum16, vzero ), _mm_unpackhi_epi16( vsum16, vzero ) );
+      vsum32 = _mm_add_epi32( vsum32, vsumtemp );
 
       pOrg += strideOrg;
       pCur += strideCur;
     }
 
-    vsum32 = _mm256_hadd_epi32(vsum32, vzero);
-    vsum32 = _mm256_hadd_epi32(vsum32, vzero);
-    sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(vsum32)) + _mm_cvtsi128_si32(_mm256_castsi256_si128(_mm256_permute2x128_si256(vsum32, vsum32, 0x11)));
+    vsum32 = _mm_add_epi32( vsum32, _mm_shuffle_epi32( vsum32, 0x4e ) );   // 01001110
+    vsum32 = _mm_add_epi32( vsum32, _mm_shuffle_epi32( vsum32, 0xb1 ) );   // 10110001
+    sum = _mm_cvtsi128_si32( vsum32 );
   }
-#endif
-  else // multiple of 8
+  else if( !( width & 3 ) ) // multiple of 4
   {
     __m128i vzero = _mm_setzero_si128();
     __m128i vsum32 = vzero;
 
-    for (; height != 0; height -= subStep)
+    for( int m = 0; m < height; m += subStep )
     {
       __m128i vsum16 = vzero;
 
-      for (int n = 0; n < width; n += 8)
+      for( int n = 0; n < width; n += 4 )
       {
-        __m128i org = _mm_lddqu_si128((__m128i*)(pOrg + n));
-        __m128i cur = _mm_lddqu_si128((__m128i*)(pCur + n));
-        vsum16 = _mm_adds_epi16(vsum16, _mm_sub_epi16(org, cur));
+        __m128i org = _mm_loadl_epi64( ( __m128i* )( pOrg + n ) );
+        __m128i cur = _mm_loadl_epi64( ( __m128i* )( pCur + n ) );
+        vsum16 = _mm_adds_epi16( vsum16, _mm_sub_epi16( org, cur ) );
       }
 
-      __m128i vsign = _mm_cmpgt_epi16(vzero, vsum16);
-      __m128i vsumtemp = _mm_add_epi32(_mm_unpacklo_epi16(vsum16, vsign), _mm_unpackhi_epi16(vsum16, vsign));
-      vsum32 = _mm_add_epi32(vsum32, vsumtemp);
+      __m128i vsign = _mm_cmpgt_epi16( vzero, vsum16 );
+      vsum32 = _mm_add_epi32( vsum32, _mm_unpacklo_epi16( vsum16, vsign ) );
 
       pOrg += strideOrg;
       pCur += strideCur;
       rowCnt++;
     }
 
-    vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0x4e));   // 01001110
-    vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0xb1));   // 10110001
-    deltaAvg = _mm_cvtsi128_si32(vsum32) / (width * rowCnt);
+    vsum32 = _mm_add_epi32( vsum32, _mm_shuffle_epi32( vsum32, 0x4e ) );   // 01001110
+    vsum32 = _mm_add_epi32( vsum32, _mm_shuffle_epi32( vsum32, 0xb1 ) );   // 10110001
+    deltaAvg = _mm_cvtsi128_si32( vsum32 ) / ( width * rowCnt );
 
     pOrg = (const short*)rcDtParam.org.buf;
     pCur = (const short*)rcDtParam.cur.buf;
-    height = rcDtParam.org.height;
 
-    __m128i delta = _mm_set1_epi16(deltaAvg);
+    __m128i delta = _mm_set1_epi16( deltaAvg );
     vsum32 = vzero;
 
-    for (; height != 0; height -= subStep)
+    for( int m = 0; m < height; m += subStep )
     {
       __m128i vsum16 = vzero;
 
-      for (int n = 0; n < width; n += 8)
+      for( int n = 0; n < width; n += 4 )
       {
-        __m128i org = _mm_lddqu_si128((__m128i*)(pOrg + n));
-        __m128i cur = _mm_lddqu_si128((__m128i*)(pCur + n));
-        __m128i abs = _mm_abs_epi16(_mm_sub_epi16(_mm_sub_epi16(org, cur), delta));
-        vsum16 = _mm_adds_epu16(abs, vsum16);
+        __m128i org = _mm_loadl_epi64( ( __m128i* )( pOrg + n ) );
+        __m128i cur = _mm_loadl_epi64( ( __m128i* )( pCur + n ) );
+        __m128i abs = _mm_abs_epi16( _mm_sub_epi16( _mm_sub_epi16( org, cur ), delta ) );
+        vsum16 = _mm_adds_epu16( abs, vsum16 );
       }
 
-      __m128i vsumtemp = _mm_add_epi32(_mm_unpacklo_epi16(vsum16, vzero), _mm_unpackhi_epi16(vsum16, vzero));
-      vsum32 = _mm_add_epi32(vsum32, vsumtemp);
+      vsum32 = _mm_add_epi32( vsum32, _mm_unpacklo_epi16( vsum16, vzero ) );
 
       pOrg += strideOrg;
       pCur += strideCur;
     }
 
-    vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0x4e));   // 01001110
-    vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0xb1));   // 10110001
-    sum = _mm_cvtsi128_si32(vsum32);
+    vsum32 = _mm_add_epi32( vsum32, _mm_shuffle_epi32( vsum32, 0x4e ) );   // 01001110
+    vsum32 = _mm_add_epi32( vsum32, _mm_shuffle_epi32( vsum32, 0xb1 ) );   // 10110001
+    sum = _mm_cvtsi128_si32( vsum32 );
+  }
+  else
+  {
+    return RdCost::xGetMRSAD( rcDtParam );
   }
 
   sum <<= subShift;
@@ -2902,127 +2909,131 @@ Distortion RdCost::xGetTMErrorFull_SIMD(const DistParam& rcDtParam)
 
   // compute matching cost
   Distortion partSum = 0;
-  if (trueAfalseL)
+  if( trueAfalseL )
   {
     const int subblkWidth = iCols >> 2;
-
-    if (subblkWidth & 7) // multiple of 4
+#ifdef USE_AVX2
+    if( vext >= AVX2 && !( subblkWidth & 15 ) ) // multiple of 16
     {
-      __m128i vzero  = _mm_setzero_si128();
-      __m128i vsum32 = vzero;
-      __m128i delta  = mr ? _mm_set1_epi16(deltaMean) : vzero;
+      __m256i vzero = _mm256_setzero_si256();
+      __m256i vsum32 = vzero;
+      __m256i delta = mr ? _mm256_set1_epi16( deltaMean ) : vzero;
 
-      for (uint32_t j = 0; j < iRows; j += iSubStep)
+      for( uint32_t j = 0; j < iRows; j += iSubStep )
       {
-        __m128i vsum32row = vzero;
-        for (uint32_t m = 0, n = 0; m < iCols; m += (iCols >> 2), n++)
+        __m256i vsum32row = vzero;
+        for( uint32_t m = 0, n = 0; m < iCols; m += ( iCols >> 2 ), n++ )
         {
-          __m128i vsum32subblk = vzero;
-          for (uint32_t i = m; i < m + (iCols >> 2); i += 4)
+          __m256i vsum32subblk = vzero;
+          for( uint32_t i = m; i < m + ( iCols >> 2 ); i += 16 )
           {
-            __m128i vsum16 = vzero;
-            // 4 samples per iteration
+            __m256i vsum16 = vzero;
+            // 16 samples per iteration
             {
-              __m128i cur = _mm_loadl_epi64((__m128i*)(piCur + i));
-              __m128i ref = _mm_loadl_epi64((__m128i*)(piRef + i));
-              vsum16      = mr ? _mm_abs_epi16(_mm_sub_epi16(_mm_sub_epi16(cur, ref), delta)) : _mm_abs_epi16(_mm_sub_epi16(cur, ref));
+              __m256i cur = _mm256_lddqu_si256( ( __m256i* )( piCur + i ) );
+              __m256i ref = _mm256_lddqu_si256( ( __m256i* )( piRef + i ) );
+              vsum16 = mr ? _mm256_abs_epi16( _mm256_sub_epi16( _mm256_sub_epi16( cur, ref ), delta ) ) : _mm256_abs_epi16( _mm256_sub_epi16( cur, ref ) );
             }
 
-            vsum32subblk = _mm_add_epi32(vsum32subblk, _mm_unpacklo_epi16(vsum16, vzero));
+            __m256i vsumtemp = _mm256_add_epi32( _mm256_unpacklo_epi16( vsum16, vzero ), _mm256_unpackhi_epi16( vsum16, vzero ) );
+            vsum32subblk = _mm256_add_epi32( vsum32subblk, vsumtemp );
           }
 
-          vsum32row = _mm_add_epi32(vsum32row, _mm_slli_epi32(vsum32subblk, tplWeightS[n]));
+          vsum32row = _mm256_add_epi32( vsum32row, _mm256_slli_epi32( vsum32subblk, tplWeightS[n] ) );
         }
 
-        vsum32 = _mm_add_epi32(vsum32, _mm_slli_epi32(vsum32row, tplWeightD[j]));
+        vsum32 = _mm256_add_epi32( vsum32, _mm256_slli_epi32( vsum32row, tplWeightD[j] ) );
 
         piCur += iStrideCur;
         piRef += iStrideRef;
       }
 
-      vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0x4e));   // 01001110
-      vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0xb1));   // 10110001
-      partSum = _mm_cvtsi128_si32(vsum32);
+      vsum32 = _mm256_hadd_epi32( vsum32, vzero );
+      vsum32 = _mm256_hadd_epi32( vsum32, vzero );
+      partSum = _mm_cvtsi128_si32( _mm256_castsi256_si128( vsum32 ) ) + _mm_cvtsi128_si32( _mm256_castsi256_si128( _mm256_permute2x128_si256( vsum32, vsum32, 0x11 ) ) );
     }
-#ifdef USE_AVX2
-    else if (vext >= AVX2 && subblkWidth >= 16) // multiple of 16
+    else
+#endif
+    if( !( subblkWidth & 7 ) ) // multiple of 8
     {
-      __m256i vzero  = _mm256_setzero_si256();
-      __m256i vsum32 = vzero;
-      __m256i delta  = mr ? _mm256_set1_epi16(deltaMean) : vzero;
+      __m128i vzero = _mm_setzero_si128();
+      __m128i vsum32 = vzero;
+      __m128i delta = mr ? _mm_set1_epi16( deltaMean ) : vzero;
 
-      for (uint32_t j = 0; j < iRows; j += iSubStep)
+      for( uint32_t j = 0; j < iRows; j += iSubStep )
       {
-        __m256i vsum32row = vzero;
-        for (uint32_t m = 0, n = 0; m < iCols; m += (iCols >> 2), n++)
+        __m128i vsum32row = vzero;
+        for( uint32_t m = 0, n = 0; m < iCols; m += ( iCols >> 2 ), n++ )
         {
-          __m256i vsum32subblk = vzero;
-          for (uint32_t i = m; i < m + (iCols >> 2); i += 16)
+          __m128i vsum32subblk = vzero;
+          for( uint32_t i = m; i < m + ( iCols >> 2 ); i += 8 )
           {
-            __m256i vsum16 = vzero;
-            // 16 samples per iteration
+            __m128i vsum16 = vzero;
+            // 8 samples per iteration
             {
-              __m256i cur = _mm256_lddqu_si256((__m256i*)(piCur + i));
-              __m256i ref = _mm256_lddqu_si256((__m256i*)(piRef + i));
-              vsum16      = mr ? _mm256_abs_epi16(_mm256_sub_epi16(_mm256_sub_epi16(cur, ref), delta)) : _mm256_abs_epi16(_mm256_sub_epi16(cur, ref));
+              __m128i cur = _mm_lddqu_si128( ( __m128i* )( piCur + i ) );
+              __m128i ref = _mm_lddqu_si128( ( __m128i* )( piRef + i ) );
+              vsum16 = mr ? _mm_abs_epi16( _mm_sub_epi16( _mm_sub_epi16( cur, ref ), delta ) ) : _mm_abs_epi16( _mm_sub_epi16( cur, ref ) );
             }
 
-            __m256i vsumtemp = _mm256_add_epi32(_mm256_unpacklo_epi16(vsum16, vzero), _mm256_unpackhi_epi16(vsum16, vzero));
-            vsum32subblk = _mm256_add_epi32(vsum32subblk, vsumtemp);
+            __m128i vsumtemp = _mm_add_epi32( _mm_unpacklo_epi16( vsum16, vzero ), _mm_unpackhi_epi16( vsum16, vzero ) );
+            vsum32subblk = _mm_add_epi32( vsum32subblk, vsumtemp );
           }
 
-          vsum32row = _mm256_add_epi32(vsum32row, _mm256_slli_epi32(vsum32subblk, tplWeightS[n]));
+          vsum32row = _mm_add_epi32( vsum32row, _mm_slli_epi32( vsum32subblk, tplWeightS[n] ) );
         }
 
-        vsum32 = _mm256_add_epi32(vsum32, _mm256_slli_epi32(vsum32row, tplWeightD[j]));
+        vsum32 = _mm_add_epi32( vsum32, _mm_slli_epi32( vsum32row, tplWeightD[j] ) );
 
         piCur += iStrideCur;
         piRef += iStrideRef;
       }
 
-      vsum32  = _mm256_hadd_epi32(vsum32, vzero);
-      vsum32  = _mm256_hadd_epi32(vsum32, vzero);
-      partSum = _mm_cvtsi128_si32(_mm256_castsi256_si128(vsum32)) + _mm_cvtsi128_si32(_mm256_castsi256_si128(_mm256_permute2x128_si256(vsum32, vsum32, 0x11)));
+      vsum32 = _mm_add_epi32( vsum32, _mm_shuffle_epi32( vsum32, 0x4e ) );   // 01001110
+      vsum32 = _mm_add_epi32( vsum32, _mm_shuffle_epi32( vsum32, 0xb1 ) );   // 10110001
+      partSum = _mm_cvtsi128_si32( vsum32 );
     }
-#endif
-    else // multiple of 8
+    else if( !( subblkWidth & 3 ) ) // multiple of 4
     {
-      __m128i vzero  = _mm_setzero_si128();
+      __m128i vzero = _mm_setzero_si128();
       __m128i vsum32 = vzero;
-      __m128i delta  = mr ? _mm_set1_epi16(deltaMean) : vzero;
+      __m128i delta = mr ? _mm_set1_epi16( deltaMean ) : vzero;
 
-      for (uint32_t j = 0; j < iRows; j += iSubStep)
+      for( uint32_t j = 0; j < iRows; j += iSubStep )
       {
         __m128i vsum32row = vzero;
-        for (uint32_t m = 0, n = 0; m < iCols; m += (iCols >> 2), n++)
+        for( uint32_t m = 0, n = 0; m < iCols; m += ( iCols >> 2 ), n++ )
         {
           __m128i vsum32subblk = vzero;
-          for (uint32_t i = m; i < m + (iCols >> 2); i += 8)
+          for( uint32_t i = m; i < m + ( iCols >> 2 ); i += 4 )
           {
             __m128i vsum16 = vzero;
-            // 8 samples per iteration
+            // 4 samples per iteration
             {
-              __m128i cur = _mm_lddqu_si128((__m128i*)(piCur + i));
-              __m128i ref = _mm_lddqu_si128((__m128i*)(piRef + i));
-              vsum16      = mr ? _mm_abs_epi16(_mm_sub_epi16(_mm_sub_epi16(cur, ref), delta)) : _mm_abs_epi16(_mm_sub_epi16(cur, ref));
+              __m128i cur = _mm_loadl_epi64( ( __m128i* )( piCur + i ) );
+              __m128i ref = _mm_loadl_epi64( ( __m128i* )( piRef + i ) );
+              vsum16 = mr ? _mm_abs_epi16( _mm_sub_epi16( _mm_sub_epi16( cur, ref ), delta ) ) : _mm_abs_epi16( _mm_sub_epi16( cur, ref ) );
             }
 
-            __m128i vsumtemp = _mm_add_epi32(_mm_unpacklo_epi16(vsum16, vzero), _mm_unpackhi_epi16(vsum16, vzero));
-            vsum32subblk = _mm_add_epi32(vsum32subblk, vsumtemp);
+            vsum32subblk = _mm_add_epi32( vsum32subblk, _mm_unpacklo_epi16( vsum16, vzero ) );
           }
 
-          vsum32row = _mm_add_epi32(vsum32row, _mm_slli_epi32(vsum32subblk, tplWeightS[n]));
+          vsum32row = _mm_add_epi32( vsum32row, _mm_slli_epi32( vsum32subblk, tplWeightS[n] ) );
         }
 
-        vsum32 = _mm_add_epi32(vsum32, _mm_slli_epi32(vsum32row, tplWeightD[j]));
+        vsum32 = _mm_add_epi32( vsum32, _mm_slli_epi32( vsum32row, tplWeightD[j] ) );
 
         piCur += iStrideCur;
         piRef += iStrideRef;
       }
 
-      vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0x4e));   // 01001110
-      vsum32 = _mm_add_epi32(vsum32, _mm_shuffle_epi32(vsum32, 0xb1));   // 10110001
-      partSum = _mm_cvtsi128_si32(vsum32);
+      vsum32 = _mm_add_epi32( vsum32, _mm_shuffle_epi32( vsum32, 0x4e ) );   // 01001110
+      vsum32 = _mm_add_epi32( vsum32, _mm_shuffle_epi32( vsum32, 0xb1 ) );   // 10110001
+      partSum = _mm_cvtsi128_si32( vsum32 );
+    }
+    else
+    {
+      return RdCost::xGetTMErrorFull<tplSize, trueAfalseL, mr>( rcDtParam );
     }
   }
   else
@@ -3038,12 +3049,10 @@ Distortion RdCost::xGetTMErrorFull_SIMD(const DistParam& rcDtParam)
       {
         __m128i vsum16 = vzero;
         // 4 samples per row
-        {
-          __m128i cur = _mm_loadl_epi64((__m128i*)piCur);
-          __m128i ref = _mm_loadl_epi64((__m128i*)piRef);
-          __m128i abs = mr ? _mm_abs_epi16(_mm_sub_epi16(_mm_sub_epi16(cur, ref), delta)) : _mm_abs_epi16(_mm_sub_epi16(cur, ref));
-          vsum16 = _mm_adds_epu16(abs, vsum16);
-        }
+        __m128i cur = _mm_loadl_epi64( ( __m128i* )piCur );
+        __m128i ref = _mm_loadl_epi64( ( __m128i* )piRef );
+        __m128i abs = mr ? _mm_abs_epi16( _mm_sub_epi16( _mm_sub_epi16( cur, ref ), delta ) ) : _mm_abs_epi16( _mm_sub_epi16( cur, ref ) );
+        vsum16 = _mm_adds_epu16( abs, vsum16 );
 
         vsum32subblks = _mm_add_epi32(vsum32subblks, _mm_unpacklo_epi16(vsum16, vzero));